import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from tabulate import tabulate
import seaborn as sns
import altair as alt
from datetime import datetime
from scipy.stats import expon, gamma, poisson
from scipy.stats import norm, gamma, weibull_min
import statsmodels.api as sm
from statsmodels.tsa.seasonal import STL
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_absolute_error, mean_squared_error
from statsmodels.tsa.holtwinters import SimpleExpSmoothing, ExponentialSmoothing
import scipy.stats as stats
from scipy.stats import chi2_contingency
alt.data_transformers.enable("vegafusion")
DataTransformerRegistry.enable('vegafusion')
import warnings
warnings.filterwarnings('ignore')
# Reading the original dataset
df = pd.read_csv("Crime_Data_from_2020_to_Present.csv", parse_dates=['Date Rptd','DATE OCC'])
# General information
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 892934 entries, 0 to 892933 Data columns (total 28 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 DR_NO 892934 non-null int64 1 Date Rptd 892934 non-null datetime64[ns] 2 DATE OCC 892934 non-null datetime64[ns] 3 TIME OCC 892934 non-null int64 4 AREA 892934 non-null int64 5 AREA NAME 892934 non-null object 6 Rpt Dist No 892934 non-null int64 7 Part 1-2 892934 non-null int64 8 Crm Cd 892934 non-null int64 9 Crm Cd Desc 892934 non-null object 10 Mocodes 768605 non-null object 11 Vict Age 892934 non-null int64 12 Vict Sex 774633 non-null object 13 Vict Descent 774625 non-null object 14 Premis Cd 892924 non-null float64 15 Premis Desc 892386 non-null object 16 Weapon Used Cd 309539 non-null float64 17 Weapon Desc 309539 non-null object 18 Status 892934 non-null object 19 Status Desc 892934 non-null object 20 Crm Cd 1 892923 non-null float64 21 Crm Cd 2 65079 non-null float64 22 Crm Cd 3 2200 non-null float64 23 Crm Cd 4 63 non-null float64 24 LOCATION 892934 non-null object 25 Cross Street 140980 non-null object 26 LAT 892934 non-null float64 27 LON 892934 non-null float64 dtypes: datetime64[ns](2), float64(8), int64(7), object(11) memory usage: 190.8+ MB
print("The percentage of missing values by column is:\n", (df.isnull().sum()/len(df))*100)
The percentage of missing values by column is: DR_NO 0.000000 Date Rptd 0.000000 DATE OCC 0.000000 TIME OCC 0.000000 AREA 0.000000 AREA NAME 0.000000 Rpt Dist No 0.000000 Part 1-2 0.000000 Crm Cd 0.000000 Crm Cd Desc 0.000000 Mocodes 13.923649 Vict Age 0.000000 Vict Sex 13.248572 Vict Descent 13.249467 Premis Cd 0.001120 Premis Desc 0.061371 Weapon Used Cd 65.334616 Weapon Desc 65.334616 Status 0.000000 Status Desc 0.000000 Crm Cd 1 0.001232 Crm Cd 2 92.711779 Crm Cd 3 99.753621 Crm Cd 4 99.992945 LOCATION 0.000000 Cross Street 84.211599 LAT 0.000000 LON 0.000000 dtype: float64
df.duplicated().sum()
0
# 1. Removing the rows with Premis Desc, Vict Sex and Vict Descent equal to NA
df_clean = df.dropna(subset=['Vict Sex', 'Vict Descent', 'Premis Desc'])
# 2. Removing rows with age lower or equal than zero
df_clean = df_clean[df_clean['Vict Age'] > 0]
# 3. Replacing 'Crm Cd 1' NA's values with 'Crm Cd'
# Identify the indexes of missing values in 'Crm Cd 1' column
missing_indexes = df_clean[df_clean['Crm Cd 1'].isna()].index
# Replace missing values in 'Crm Cd 1' column with corresponding values from 'Crm Cd'
df_clean.loc[missing_indexes, 'Crm Cd 1'] = df_clean.loc[missing_indexes, 'Crm Cd']
# 4. Removing rows with LAT and Lon equal to zero
# Get the index of rows where both 'LAT' and 'LON' are equal to 0
index_to_drop = df_clean[(df_clean['LAT'] == 0) & (df_clean['LON'] == 0)].index
# Drop the rows with the specified index
df_clean.drop(index=index_to_drop, inplace=True)
# 5. Removing columns
df_clean = df_clean.drop(columns=['DR_NO','Part 1-2', 'Mocodes', 'Weapon Used Cd', 'Weapon Desc', 'Crm Cd 2', 'Crm Cd 3', 'Crm Cd 4', 'Cross Street',
'AREA', 'Status', 'Crm Cd', 'Premis Cd'])
print("Data shape\n", df_clean.shape)
print("\nMissing values\n", df_clean.isna().sum())
print("\nInformation\n")
print(df_clean.info())
Data shape (666842, 15) Missing values Date Rptd 0 DATE OCC 0 TIME OCC 0 AREA NAME 0 Rpt Dist No 0 Crm Cd Desc 0 Vict Age 0 Vict Sex 0 Vict Descent 0 Premis Desc 0 Status Desc 0 Crm Cd 1 0 LOCATION 0 LAT 0 LON 0 dtype: int64 Information <class 'pandas.core.frame.DataFrame'> Int64Index: 666842 entries, 1 to 892931 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Date Rptd 666842 non-null datetime64[ns] 1 DATE OCC 666842 non-null datetime64[ns] 2 TIME OCC 666842 non-null int64 3 AREA NAME 666842 non-null object 4 Rpt Dist No 666842 non-null int64 5 Crm Cd Desc 666842 non-null object 6 Vict Age 666842 non-null int64 7 Vict Sex 666842 non-null object 8 Vict Descent 666842 non-null object 9 Premis Desc 666842 non-null object 10 Status Desc 666842 non-null object 11 Crm Cd 1 666842 non-null float64 12 LOCATION 666842 non-null object 13 LAT 666842 non-null float64 14 LON 666842 non-null float64 dtypes: datetime64[ns](2), float64(3), int64(3), object(7) memory usage: 81.4+ MB None
#df_clean.to_csv('clean_data.csv', index=False)
# Data Loading and DataType Changing
crime_data = pd.read_csv('crime_clean_data.csv', parse_dates=['Date Rptd','DATE OCC','TIME_OCCURRENCE'])
crime_data['DUMMY TIME OCC'] = pd.to_datetime(crime_data['TIME OCC'], format='%H%M', errors='coerce')
# Adding new columns for Analusing purpose
crime_data['Date Rptd Year'] = crime_data['Date Rptd'].dt.year
crime_data['Date Rptd Month'] = crime_data['Date Rptd'].dt.month
crime_data['Date Occured Year'] = crime_data['DATE OCC'].dt.year
crime_data['Date Occured Month'] = crime_data['DATE OCC'].dt.month
crime_data['Date Occured Day'] = crime_data['DATE OCC'].dt.day_name()
#crime_data_sorted = crime_data.sort_values(by='DUMMY TIME OCC')
def categorize_time(time):
hour = time.hour
if 6 <= hour <= 18:
return 'Daytime'
else:
return 'Nighttime'
crime_data['Time Category'] = crime_data['TIME_OCCURRENCE'].apply(categorize_time)
daytime_data = crime_data[crime_data['Time Category'] == 'Daytime']
nighttime_data = crime_data[crime_data['Time Category'] == 'Nighttime']
crime_data['response_rate_in_day'] = abs((crime_data['DATE OCC'] - crime_data['Date Rptd']).dt.days)
crime_data['response_rate_in_hour'] = abs((crime_data['DATE OCC'] - crime_data['Date Rptd']).dt.total_seconds()/3600)
crime_data['OccDateTimeCombined'] = (crime_data['DATE OCC']).astype('str') + " "+ (crime_data['TIME_OCCURRENCE'].dt.time).astype('str')
crime_data['OccDateTimeCombined'] = pd.to_datetime(crime_data['OccDateTimeCombined'])
time_bins = pd.cut(crime_data['OccDateTimeCombined'].dt.hour, bins=range(0, 25, 1), right=False)
crime_data['time occurance bins'] = time_bins.values.astype(str)
crime_data['time occurance bins h'] =[binval.right for binval in time_bins.values]
#Victim Age
conditions = [
crime_data['Vict Age'] <= 5,
(crime_data['Vict Age'] >= 6) & (crime_data['Vict Age'] <= 12),
(crime_data['Vict Age'] >= 13) & (crime_data['Vict Age'] <= 17),
(crime_data['Vict Age'] >= 18) & (crime_data['Vict Age'] <= 21),
(crime_data['Vict Age'] >= 22) & (crime_data['Vict Age'] <= 40),
(crime_data['Vict Age'] >= 41) & (crime_data['Vict Age'] <= 65),
(crime_data['Vict Age'] >= 66)
]
choices = ['Early Childhood(<5)', 'Middle Childhood(6-13)', 'Early Adolescence(13-17)', 'Late Adolescence(18-21)',
'Early Adulthood(22-40)', 'Middle Adulthood(41-65)', 'Late Adulthood(>65)']
crime_data['Victim_Age_New'] = np.select(conditions, choices, default='UNKNOWN')
#Crime Category
crime_data['Crime Category'] = 'Other'
def categorize_fraud(row):
fraud_keywords = ['FRAUD', 'FORGERY', 'EMBEZZLEMENT', 'BRIBERY', 'DISHONEST', 'FORGERY', 'COUNTERFEIT', 'FALSE']
for keyword in fraud_keywords:
if keyword in row['Crm Cd Desc']:
return 'Fraud and White-Collar Crimes'
return 'Other'
# Define functions to categorize crimes
def categorize_theft(row):
theft_keywords = ['THEFT', 'BURGLARY', 'ROBBERY', 'FRAUD', 'STEALING', 'EMBEZZLEMENT', 'PICKPOCKET', 'STOLEN']
for keyword in theft_keywords:
if keyword in row['Crm Cd Desc']:
return 'Theft and Robbery'
return 'Other'
def categorize_assault(row):
assault_keywords = ['ASSAULT', 'BATTERY', 'THROWING', 'THREATENING']
for keyword in assault_keywords:
if keyword in row['Crm Cd Desc']:
return 'Assault and Violence'
return 'Other'
def categorize_sexual_offenses(row):
sexual_keywords = ['SEXUAL', 'SEX', 'RAPE', 'PROSTITUTION', 'INDECENT EXPOSURE', 'RAPE', 'SEXUAL', 'LEWD ', 'INDECENT', 'PIMPING', 'PORNOGRAPHY', 'TRAFFICKING']
for keyword in sexual_keywords:
if keyword in row['Crm Cd Desc']:
return 'Sexual Offenses'
return 'Other'
def categorize_vandalism(row):
vandalism_keywords = ['VANDALISM', 'ARSON', 'PROPERTY', 'DISRUPT']
for keyword in vandalism_keywords:
if keyword in row['Crm Cd Desc']:
return 'Vandalism and Property Damage'
return 'Other'
crime_data['Crime Category'] = crime_data.apply(lambda row: categorize_fraud(row) if row['Crime Category'] == 'Other' else row['Crime Category'], axis=1)
crime_data['Crime Category'] = crime_data.apply(lambda row: categorize_theft(row) if row['Crime Category'] == 'Other' else row['Crime Category'], axis=1)
crime_data['Crime Category'] = crime_data.apply(lambda row: categorize_assault(row) if row['Crime Category'] == 'Other' else row['Crime Category'], axis=1)
crime_data['Crime Category'] = crime_data.apply(lambda row: categorize_sexual_offenses(row) if row['Crime Category'] == 'Other' else row['Crime Category'], axis=1)
crime_data['Crime Category'] = crime_data.apply(lambda row: categorize_vandalism(row) if row['Crime Category'] == 'Other' else row['Crime Category'], axis=1)
# Vict race/ethnic
crime_data['Vict_Descent_New'] = np.where(crime_data['Vict Descent'] == 'H', 'Latino and Hispanic',
np.where(crime_data['Vict Descent'] == 'W', 'White',
np.where(crime_data['Vict Descent'] == 'B', 'Black', 'Other')))
# Victim Sex
conditions2 = [
crime_data['Vict Sex'] == 'M',
crime_data['Vict Sex'] == 'F',
(crime_data['Vict Sex'] == 'H') | (crime_data['Vict Sex'] == 'X')
]
choices2 = ['Male', 'Female', 'Unknown']
crime_data['Victim_Sex_New'] = np.select(conditions2, choices2, default='UNKNOWN')
# Crime Location
conditions3 = [
crime_data['Premis Desc'] == 'SINGLE FAMILY DWELLING',
crime_data['Premis Desc'] == 'MULTI-UNIT DWELLING (APARTMENT, DUPLEX, ETC)',
crime_data['Premis Desc'].isin(['STREET', 'SIDEWALK', 'ALLEY', 'PARK/PLAYGROUND']),
crime_data['Premis Desc'].isin(['VEHICLE, PASSENGER/TRUCK', 'TRANSPORTATION FACILITY (AIRPORT)', 'MTA BUS', 'BUS STOP', 'LA UNION STATION (NOT LINE SPECIFIC)']),
crime_data['Premis Desc'].isin(['PARKING LOT', 'GARAGE/CARPORT', 'DRIVEWAY', 'PARKING', 'PORCH, RESIDENTIAL', 'PARKING UNDERGROUND/BUILDING',
'OTHER/OUTSIDE']),
crime_data['Premis Desc'].isin(['OTHER BUSINESS', 'RESTAURANT/FAST FOOD', 'DEPARTMENT STORE', 'HOTEL', 'MARKET', 'YARD (RESIDENTIAL/BUSINESS)',
'CLOTHING STORE', 'GAS STATION','PUBLIC STORAGE', 'BANK', 'OTHER STORE', 'MOTEL', 'AUTOMATED TELLER MACHINE (ATM)',
'MINI-MART', 'DRUG STORE', 'HEALTH SPA/GYM', 'LIQUOR STORE', 'SHOPPING MALL (COMMON AREA)', 'LAUNDROMAT',
"COFFEE SHOP (STARBUCKS, COFFEE BEAN, PEET'S, ETC.)", 'BAR/COCKTAIL/NIGHTCLUB', 'NIGHT CLUB (OPEN EVENINGS ONLY)',
'MAIL BOX']),
crime_data['Premis Desc'].isin(['OTHER RESIDENCE', 'CONDOMINIUM/TOWNHOUSE']),
crime_data['Premis Desc'].isin(['HIGH SCHOOL', 'JUNIOR HIGH SCHOOL', 'ELEMENTARY SCHOOL', 'COLLEGE/JUNIOR COLLEGE/UNIVERSITY'])
]
choices3 = ['SINGLE FAMILY DWELLING', 'MULTI-UNIT DWELLING' ,'STREET', 'TRANSPORTATION', 'PARKING', 'COMMERCIAL', 'OTHER RESIDENCE','CAMPUS']
# Apply conditions and assign values to the new column
crime_data['CRIME_PLACE_NEW'] = np.select(conditions3, choices3, default='OTHER')
#General Data representation
print("Shape of the dataset:", crime_data.shape)
print("\nColumn names:", crime_data.columns)
print("\nData types of each column:\n", crime_data.dtypes)
print("\nSummary statistics for numerical columns:\n", crime_data.describe())
Shape of the dataset: (666842, 38)
Column names: Index(['DR_NO', 'Date Rptd', 'DATE OCC', 'TIME OCC', 'AREA NAME',
'Rpt Dist No', 'Crm Cd', 'Crm Cd Desc', 'Vict Age', 'Vict Sex',
'Vict Descent', 'Premis Cd', 'Premis Desc', 'Status Desc', 'Crm Cd 1',
'LOCATION', 'LAT', 'LON', 'TIME_OCCURRENCE', 'Year', 'Month',
'DUMMY TIME OCC', 'Date Rptd Year', 'Date Rptd Month',
'Date Occured Year', 'Date Occured Month', 'Date Occured Day',
'Time Category', 'response_rate_in_day', 'response_rate_in_hour',
'OccDateTimeCombined', 'time occurance bins', 'time occurance bins h',
'Victim_Age_New', 'Crime Category', 'Vict_Descent_New',
'Victim_Sex_New', 'CRIME_PLACE_NEW'],
dtype='object')
Data types of each column:
DR_NO int64
Date Rptd datetime64[ns]
DATE OCC datetime64[ns]
TIME OCC int64
AREA NAME object
Rpt Dist No int64
Crm Cd int64
Crm Cd Desc object
Vict Age int64
Vict Sex object
Vict Descent object
Premis Cd float64
Premis Desc object
Status Desc object
Crm Cd 1 float64
LOCATION object
LAT float64
LON float64
TIME_OCCURRENCE datetime64[ns]
Year int64
Month int64
DUMMY TIME OCC datetime64[ns]
Date Rptd Year int64
Date Rptd Month int64
Date Occured Year int64
Date Occured Month int64
Date Occured Day object
Time Category object
response_rate_in_day int64
response_rate_in_hour float64
OccDateTimeCombined datetime64[ns]
time occurance bins object
time occurance bins h int64
Victim_Age_New object
Crime Category object
Vict_Descent_New object
Victim_Sex_New object
CRIME_PLACE_NEW object
dtype: object
Summary statistics for numerical columns:
DR_NO TIME OCC Rpt Dist No Crm Cd \
count 6.668420e+05 666842.000000 666842.000000 666842.000000
mean 2.174547e+08 1324.299558 1115.406512 499.051063
std 1.149155e+07 652.479882 616.305360 222.071159
min 1.030447e+07 1.000000 101.000000 110.000000
25% 2.103145e+08 900.000000 585.000000 330.000000
50% 2.203204e+08 1400.000000 1135.000000 440.000000
75% 2.302209e+08 1845.000000 1623.000000 626.000000
max 2.421054e+08 2359.000000 2199.000000 956.000000
Vict Age Premis Cd Crm Cd 1 LAT \
count 666842.000000 666842.000000 666842.000000 666842.000000
mean 39.576044 334.730074 498.733717 34.075298
std 15.601200 216.670941 221.825524 0.110089
min 2.000000 101.000000 110.000000 33.706100
25% 28.000000 104.000000 330.000000 34.015500
50% 37.000000 404.000000 440.000000 34.059100
75% 50.000000 502.000000 626.000000 34.165400
max 99.000000 971.000000 956.000000 34.334300
LON Year Month Date Rptd Year \
count 666842.000000 666842.000000 666842.000000 666842.000000
mean -118.356406 2021.600324 6.405340 2021.632465
std 0.105361 1.143860 3.485874 1.147119
min -118.667600 2020.000000 1.000000 2020.000000
25% -118.433175 2021.000000 3.000000 2021.000000
50% -118.324700 2022.000000 6.000000 2022.000000
75% -118.275500 2023.000000 9.000000 2023.000000
max -118.155400 2024.000000 12.000000 2024.000000
Date Rptd Month Date Occured Year Date Occured Month \
count 666842.000000 666842.000000 666842.000000
mean 6.416731 2021.600324 6.405340
std 3.485803 1.143860 3.485874
min 1.000000 2020.000000 1.000000
25% 3.000000 2021.000000 3.000000
50% 6.000000 2022.000000 6.000000
75% 9.000000 2023.000000 9.000000
max 12.000000 2024.000000 12.000000
response_rate_in_day response_rate_in_hour time occurance bins h
count 666842.000000 666842.000000 666842.000000
mean 12.380555 297.133318 14.059357
std 66.257251 1590.174028 6.520230
min 0.000000 0.000000 1.000000
25% 0.000000 0.000000 10.000000
50% 0.000000 0.000000 15.000000
75% 2.000000 48.000000 19.000000
max 1483.000000 35592.000000 24.000000
# Summary statistics
div = "-----------"
print("DATE OCC")
print(div)
print(crime_data['DATE OCC'].describe(),"\n")
print("Date Rptd")
print(div)
print(crime_data['Date Rptd'].describe(),"\n")
print("AREA NAME")
print(div)
print(crime_data['AREA NAME'].describe(),"\n")
print('TIME OCC')
print(div)
print(crime_data['TIME OCC'].describe(),"\n")
print('Victim_Sex_New')
print(div)
print(crime_data['Victim_Sex_New'].describe(),"\n")
print('Vict_Descent_New')
print(div)
print(crime_data['Vict_Descent_New'].describe(),"\n")
print('CRIME_PLACE_NEW')
print(div)
print(crime_data['CRIME_PLACE_NEW'].describe(),"\n")
DATE OCC ----------- count 666842 unique 1497 top 2020-01-01 00:00:00 freq 998 first 2020-01-01 00:00:00 last 2024-02-05 00:00:00 Name: DATE OCC, dtype: object Date Rptd ----------- count 666842 unique 1497 top 2023-02-03 00:00:00 freq 776 first 2020-01-01 00:00:00 last 2024-02-05 00:00:00 Name: Date Rptd, dtype: object AREA NAME ----------- count 666842 unique 21 top Central freq 47350 Name: AREA NAME, dtype: object TIME OCC ----------- count 666842.000000 mean 1324.299558 std 652.479882 min 1.000000 25% 900.000000 50% 1400.000000 75% 1845.000000 max 2359.000000 Name: TIME OCC, dtype: float64 Victim_Sex_New ----------- count 666842 unique 3 top Male freq 336084 Name: Victim_Sex_New, dtype: object Vict_Descent_New ----------- count 666842 unique 4 top Latino and Hispanic freq 266866 Name: Vict_Descent_New, dtype: object CRIME_PLACE_NEW ----------- count 666842 unique 9 top STREET freq 169219 Name: CRIME_PLACE_NEW, dtype: object
mean_value = crime_data['AREA NAME'].value_counts().mean()
plt.figure(figsize=(10, 3))
crime_data['AREA NAME'].value_counts().plot(kind='bar',color='black',label="distribution of crime")
plt.axhline(y=mean_value, color='red', linestyle='--', label='Mean') # Draw mean line
plt.title('Distribution of crimes across Area')
plt.xlabel('AREA NAME')
plt.ylabel('Number of Crimes')
plt.legend()
plt.show()
# to get area name that has crime rate over the mean
area_with_crime_greater_than_mean = crime_data['AREA NAME'].value_counts()[crime_data['AREA NAME'].value_counts()>mean_value]
print("Area with Crime rate over the mean (Hotspot): \n",area_with_crime_greater_than_mean.index.to_list())
# to calculate percentage of crime in 9 hotspot that we found above
print("\nNumber of Hotspots: \n", len(area_with_crime_greater_than_mean))
print("\nPercentage of crime in 9 Hotspots: \n", (area_with_crime_greater_than_mean.sum()/crime_data['AREA NAME'].value_counts().sum())*100)
#to Caluculate percentage of crime in Each Area
print("\nPercentage of Crime in each area: \n")
print("\nTotal Number of Area: ",len(crime_data['AREA NAME'].value_counts()))
print(crime_data['AREA NAME'].value_counts()/crime_data['AREA NAME'].value_counts().sum()*100)
Area with Crime rate over the mean (Hotspot): ['Central', '77th Street', 'Southwest', 'Pacific', 'Hollywood', 'Southeast', 'Olympic', 'N Hollywood', 'Wilshire'] Number of Hotspots: 9 Percentage of crime in 9 Hotspots: 50.36350439834324 Percentage of Crime in each area: Total Number of Area: 21 Central 7.100633 77th Street 6.353529 Southwest 6.328486 Pacific 5.598927 Hollywood 5.301256 Southeast 5.065668 Olympic 4.943450 N Hollywood 4.836078 Wilshire 4.835478 Newton 4.660024 Topanga 4.641279 Rampart 4.589843 West LA 4.552653 Van Nuys 4.498817 West Valley 4.188248 Mission 4.066930 Northeast 4.035289 Devonshire 3.907072 Harbor 3.786054 Hollenbeck 3.362866 Foothill 3.347420 Name: AREA NAME, dtype: float64
# Distibution of Date Rptd
plt.figure(figsize=(12, 3))
crime_data['Date Rptd'].hist(bins=50, color='black')
plt.title('Distribution of Date Rptd')
plt.xlabel('Date Reported')
plt.ylabel('Frequency')
plt.show()
# Plot line plot
date_counts = crime_data['Date Rptd'].value_counts().sort_index()
plt.figure(figsize=(12, 3))
date_counts.plot(color='black')
plt.title('Distribution of Date Rptd')
plt.xlabel('Date Reported')
plt.ylabel('Frequency')
plt.show()
#for Year
bin_edges = range(int(crime_data['Date Rptd Year'].min()), int(crime_data['Date Rptd Year'].max()) + 2)
plt.figure(figsize=(12, 3))
crime_data['Date Rptd Year'].hist(bins=bin_edges, color='black', width=.9)
plt.title('Distribution of Date Rptd Year')
plt.xlabel('Date Reported')
plt.ylabel('Frequency')
plt.show()
#for Month
bin_edges = range(int(crime_data['Date Rptd Month'].min()), int(crime_data['Date Rptd Month'].max()) + 2)
plt.figure(figsize=(12, 3))
mean_value = crime_data['Date Rptd Month'].value_counts().mean()
plt.axhline(y=mean_value, color='red', linestyle='--', label='Mean')
crime_data['Date Rptd Month'].hist(bins=bin_edges, color='black', width=0.8)
plt.title('Distribution of Date Rptd Month')
plt.xlabel('Date Reported')
plt.ylabel('Frequency')
plt.show()
#for year and month together
hist = alt.Chart(crime_data).mark_bar(color="black").encode(
x=alt.X('Date Rptd Month:O', axis=alt.Axis(title='Month')),
y=alt.Y('count():Q', axis=alt.Axis(title='Frequency'))
).facet(
column='Date Rptd Year:O'
)
hist
#Date Rpdt Summary
print("Percentage of crime reported each Year: ")
print(crime_data['Date Rptd Year'].value_counts()/crime_data['Date Rptd Year'].value_counts().sum()*100)
print("\nPercentage of crime reported each Year: ")
print(crime_data['Date Rptd Month'].value_counts()/crime_data['Date Rptd Year'].value_counts().sum()*100)
#Excluding 2024
print("Excluding 2024\n")
data_excluded_2024 = crime_data[crime_data['Date Rptd Year']<2024]
print("\nPercentage of crime reported each Year: ")
print(data_excluded_2024['Date Rptd Year'].value_counts()/data_excluded_2024['Date Rptd Year'].value_counts().sum()*100)
print("\nPercentage of crime each Year: ")
print(data_excluded_2024['Date Rptd Month'].value_counts()/data_excluded_2024['Date Rptd Year'].value_counts().sum()*100)
Percentage of crime reported each Year: 2022 26.872782 2023 25.634558 2021 23.503019 2020 21.716089 2024 2.273552 Name: Date Rptd Year, dtype: float64 Percentage of crime reported each Year: 1 10.061604 7 8.496315 8 8.496166 10 8.422685 6 8.231485 5 8.208991 9 8.157255 12 8.072677 2 8.023790 3 8.019591 11 7.982251 4 7.827191 Name: Date Rptd Month, dtype: float64 Excluding 2024 Percentage of crime reported each Year: 2022 27.497963 2023 26.230932 2021 24.049804 2020 22.221302 Name: Date Rptd Year, dtype: float64 Percentage of crime each Year: 7 8.693978 8 8.693824 10 8.618634 6 8.422986 5 8.399969 9 8.347029 12 8.260483 3 8.206162 1 8.198490 11 8.167953 4 8.009287 2 7.981206 Name: Date Rptd Month, dtype: float64
# Distibution of Date Occured
plt.figure(figsize=(12, 3))
crime_data['DATE OCC'].hist(bins=50, color='black')
plt.title('Distribution of Date Occurred')
plt.xlabel('Date Reported')
plt.ylabel('Frequency')
plt.show()
# Plot line plot
date_counts = crime_data['DATE OCC'].value_counts().sort_index()
plt.figure(figsize=(12, 3))
date_counts.plot(color='black')
plt.title('Distribution of Date Occurred')
plt.xlabel('Date Reported')
plt.ylabel('Frequency')
plt.show()
#for year
bin_edges = range(int(crime_data['Date Occured Year'].min()), int(crime_data['Date Occured Year'].max()) + 2)
plt.figure(figsize=(12, 3))
crime_data['Date Occured Year'].hist(bins=bin_edges, color='black', width=.9)
plt.title('Distribution of Crime Occurred every Year')
plt.xlabel('Date Reported')
plt.ylabel('Frequency')
plt.show()
#for Month
bin_edges = range(int(crime_data['Date Occured Month'].min()), int(crime_data['Date Occured Month'].max()) + 2)
plt.figure(figsize=(12, 3))
mean_value = crime_data['Date Occured Month'].value_counts().mean()
plt.axhline(y=mean_value, color='red', linestyle='--', label='Mean')
crime_data['Date Occured Month'].hist(bins=bin_edges, color='black', width=0.8)
#plt.xlim(1, 12)
plt.title('Distribution of Crime Occured each Month')
plt.xlabel('Date Reported')
plt.ylabel('Frequency')
plt.legend()
plt.show()
#for year and month together
hist = alt.Chart(crime_data).mark_bar(color="black").encode(
x=alt.X('Date Occured Month:O', axis=alt.Axis(title='Month')),
y=alt.Y('count():Q', axis=alt.Axis(title='Frequency'))#,
).properties(height = 100, width = 100, title="Crime occured every month breakdown by year").facet(
facet='Date Occured Year:O',
columns=3
).configure_axis(
labelAngle=0
)
hist
#Date Occured Summary
print("\n Percentage of crime occured each Year: ")
print(crime_data['Date Occured Year'].value_counts()/crime_data['Date Occured Year'].value_counts().sum()*100)
print("\n Percentage of crime occured each Month: ")
print(crime_data['Date Occured Month'].value_counts()/crime_data['Date Occured Year'].value_counts().sum()*100)
#Excluding 2024
print("\nExcluding 2024:")
data_excluded_2024 = crime_data[crime_data['Date Occured Year']<2024]
print("\n Percentage of crime occured each Year: ")
print(data_excluded_2024['Date Occured Year'].value_counts()/data_excluded_2024['Date Occured Year'].value_counts().sum()*100)
print("\n Percentage of crime occured each Month: ")
print(data_excluded_2024['Date Occured Month'].value_counts()/data_excluded_2024['Date Occured Year'].value_counts().sum()*100)
counts = data_excluded_2024['Date Occured Year'].value_counts().sort_index().reset_index()
counts.columns = ['Year', 'Count']
counts['Percentage'] = counts['Count'] / counts['Count'].sum() * 100
# Create Altair Chart
chart = alt.Chart(counts).mark_line(point=True,color='black').encode(
x=alt.X('Year:N', title='Year'),
y=alt.Y('Percentage:Q', title='Percentage', scale=alt.Scale(domain=[22, counts['Percentage'].max()])),
tooltip=['Year:N', 'Percentage:Q']
).properties(
width=200,
height=100,
title='Percentage of Crime Occurrences by Year'
).configure_axis(
labelAngle=0
)
chart
Percentage of crime occured each Year: 2022 26.787005 2023 25.001275 2021 23.677573 2020 22.589909 2024 1.944239 Name: Date Occured Year, dtype: float64 Percentage of crime occured each Month: 1 10.141083 7 8.519409 10 8.479520 8 8.442630 6 8.208691 5 8.204942 9 8.142708 3 8.051532 2 8.018991 12 8.017042 11 7.909820 4 7.863632 Name: Date Occured Month, dtype: float64 Excluding 2024: Percentage of crime occured each Year: 2022 27.318135 2023 25.496997 2021 24.147049 2020 23.037819 Name: Date Occured Year, dtype: float64 Percentage of crime occured each Month: 7 8.688331 10 8.647651 8 8.610029 1 8.523010 6 8.371452 5 8.367629 9 8.304161 3 8.211177 12 8.176003 11 8.066655 4 8.019551 2 8.014351 Name: Date Occured Month, dtype: float64
table_data2 = round(crime_data['CRIME_PLACE_NEW'].value_counts(normalize=True), 3)*100
# Convert table data to a DataFrame for tabulation
table_df2 = pd.DataFrame(table_data2).reset_index()
table_df2.columns = ['CRIME_PLACE_NEW', 'Percentage']
table_df2 = table_df2.rename(columns={'CRIME_PLACE_NEW': 'Crime Place', 'Percentage': 'Percentage'})
# Display the table using tabulate
print(tabulate(table_df2, headers='keys', tablefmt='fancy_grid', showindex=False))
╒════════════════════════╤══════════════╕ │ Crime Place │ Percentage │ ╞════════════════════════╪══════════════╡ │ STREET │ 25.4 │ ├────────────────────────┼──────────────┤ │ SINGLE FAMILY DWELLING │ 22.1 │ ├────────────────────────┼──────────────┤ │ MULTI-UNIT DWELLING │ 15.6 │ ├────────────────────────┼──────────────┤ │ PARKING │ 11.8 │ ├────────────────────────┼──────────────┤ │ COMMERCIAL │ 11.6 │ ├────────────────────────┼──────────────┤ │ OTHER │ 6.7 │ ├────────────────────────┼──────────────┤ │ TRANSPORTATION │ 4.8 │ ├────────────────────────┼──────────────┤ │ OTHER RESIDENCE │ 1.2 │ ├────────────────────────┼──────────────┤ │ CAMPUS │ 0.8 │ ╘════════════════════════╧══════════════╛
crime_place_grouped = crime_data.groupby('CRIME_PLACE_NEW').size().sort_values(ascending=False).reset_index(name='COUNT')
alt.Chart(crime_place_grouped, title='Location where the crime took place').mark_bar(color='black').encode(
alt.X('COUNT', title='', scale=alt.Scale(domain=[0,170000])),
alt.Y('CRIME_PLACE_NEW', sort='x', title='')
)
# --- Original Age column ---
# Set up the figure and axes
plt.figure(figsize=(4, 3))
# Plot the histogram
sns.histplot(crime_data['Vict Age'], bins=50, color='skyblue', edgecolor='black', kde=False, stat='density')
# Fit a normal distribution to the data
mu, std = norm.fit(crime_data['Vict Age'])
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p_normal = norm.pdf(x, mu, std)
# Plot the normal curve
plt.plot(x, p_normal, 'r--', label='Normal', linewidth=2)
# Fit a gamma distribution to the data
params_gamma = gamma.fit(crime_data['Vict Age'])
p_gamma = gamma.pdf(x, *params_gamma)
# Plot the gamma curve
plt.plot(x, p_gamma, 'g--', label='Gamma', linewidth=2)
# Fit a Weibull distribution to the data
params_weibull = weibull_min.fit(crime_data['Vict Age'])
p_weibull = weibull_min.pdf(x, *params_weibull)
# Plot the Weibull curve
plt.plot(x, p_weibull, 'm--', label='Weibull', linewidth=2)
# Add legend
plt.legend()
# Add labels and title
plt.title('')
plt.xlabel('Victim Age')
plt.ylabel('Density')
# Show plot
plt.show()
# --- New grouped Age ---
# Apply conditions and assign values to the new column
crime_data['Victim_Age_New'].value_counts()
Early Adulthood(22-40) 327913 Middle Adulthood(41-65) 231357 Late Adulthood(>65) 45263 Late Adolescence(18-21) 38727 Early Adolescence(13-17) 15941 Middle Childhood(6-13) 5769 Early Childhood(<5) 1872 Name: Victim_Age_New, dtype: int64
print("\nUnique Categories for Victim Sex\n")
crime_data['Vict Sex'].value_counts()
victim_sex_original = crime_data.groupby('Vict Sex').size().sort_values(ascending=False).reset_index(name='COUNT')
print("\nCount for Victim Sext\n")
print(victim_sex_original)
Unique Categories for Victim Sex Count for Victim Sext Vict Sex COUNT 0 M 336084 1 F 322343 2 X 8320 3 H 95
# New column "Vict_Sex_New", grouping H in X='Other'
crime_data['Victim_Sex_New'] = np.select(conditions2, choices2, default='UNKNOWN')
round(crime_data['Victim_Sex_New'].value_counts(normalize=True),3)*100
Male 50.4 Female 48.3 Unknown 1.3 Name: Victim_Sex_New, dtype: float64
color_scale = alt.Scale(domain=['Male', 'Female', 'Unknown'], range=['#6baed6', '#ce6dbd', '#969696'])
victim_sex_grouped = crime_data.groupby('Victim_Sex_New').size().sort_values(ascending=False).reset_index(name='COUNT')
victim_sex = alt.Chart(victim_sex_grouped, title='Victim Sex').mark_bar(color='black').encode(
alt.X('COUNT', title=''),
alt.Y('Victim_Sex_New', sort='x', title=''),
alt.Color('Victim_Sex_New', scale=color_scale, legend=None)
)
victim_sex
# Original Codes
# A - Other Asian B - Black C - Chinese D - Cambodian F - Filipino G - Guamanian H - Hispanic/Latin/Mexican
# I - American Indian/Alaskan Native J - Japanese K - Korean L - Laotian O - Other P - Pacific Islander S - Samoan U - Hawaiian
# V - Vietnamese W - White **X - Unknown** Z - Asian Indian
print("\nUnique Categories for Victim Descent\n")
print(crime_data['Vict Descent'].nunique())
victim_descent_original = crime_data.groupby('Vict Descent').size().sort_values(ascending=False).reset_index(name='COUNT')
print("\nCount for Victim Descent\n")
print(victim_descent_original)
Unique Categories for Victim Descent 20 Count for Victim Descent Vict Descent COUNT 0 H 266866 1 W 167244 2 B 124078 3 O 58818 4 A 19247 5 X 14351 6 K 4819 7 F 3792 8 C 3526 9 J 1250 10 V 937 11 I 845 12 Z 448 13 P 237 14 U 146 15 D 72 16 G 63 17 L 59 18 S 43 19 - 1
# New column "Vict_Descent_New" grouping by the 3 predominant classes and the rest grouped as "Other"
round(crime_data['Vict_Descent_New'].value_counts(normalize=True),3)*100
Latino and Hispanic 40.0 White 25.1 Black 18.6 Other 16.3 Name: Vict_Descent_New, dtype: float64
victim_descent_grouped = crime_data.groupby('Vict_Descent_New').size().sort_values(ascending=False).reset_index(name='COUNT')
victim_descent = alt.Chart(victim_descent_grouped , title='Victim Descent').mark_bar().encode(
alt.X('COUNT', title='', scale=alt.Scale(domain=[0,350000])),
alt.Y('Vict_Descent_New', sort='x', title=''),
alt.Color('Vict_Descent_New', legend=None)
)
victim_descent
crime_data['Victim_Age_New'].unique()
array(['Middle Adulthood(41-65)', 'Late Adolescence(18-21)',
'Early Adulthood(22-40)', 'Middle Childhood(6-13)',
'Early Adolescence(13-17)', 'Late Adulthood(>65)',
'Early Childhood(<5)'], dtype=object)
demographics = crime_data.groupby(['AREA NAME', 'Victim_Sex_New', 'Victim_Age_New', 'Vict_Descent_New']).size().reset_index(name='COUNT')
# Age and Sex
sorting_order = ['Early Childhood(<5)', 'Middle Childhood(6-13)', 'Early Adolescence(13-17)', 'Late Adolescence(18-21)',
'Early Adulthood(22-40)', 'Middle Adulthood(41-65)', 'Late Adulthood(>65)']
cat_dtype = pd.CategoricalDtype(categories=sorting_order, ordered=True)
demographics_sorted = demographics.astype({'Victim_Age_New': cat_dtype})
color_scale = alt.Scale(domain=['Male', 'Female', 'Unknown'], range=['#6baed6', '#ce6dbd', '#969696'])
alt.Chart(demographics_sorted, title="Number of crimes by victim age and victim sex").mark_bar().encode(
alt.Y('Victim_Age_New', title=None, sort='x'),
alt.X('sum(COUNT)', title=''), #axis=alt.Axis(format='k')
alt.Color('Victim_Sex_New', legend=None, scale=color_scale)
).facet(alt.Row('Victim_Sex_New', title=None),
title=alt.Title("Number of crimes by victim age and victim sex"))
# Age and Descent
(alt.Chart(demographics_sorted, title="Number of crimes by victim age and victim descent").mark_bar().encode(
alt.Y('Victim_Age_New', title=None, sort='x'),
alt.X('sum(COUNT)', title=''),
alt.Color('Vict_Descent_New', legend=None)
).facet(alt.Row('Vict_Descent_New', title=None),
title=alt.Title('Number of crimes by victim age and victim descent')
))
proportions = (
alt.Chart(demographics_sorted)
.mark_bar()
.encode(
alt.X('COUNT', stack='normalize', title='', sort='-x'), # Remove axis titles for legibility
alt.Y('AREA NAME', title=''),
alt.Color('Victim_Sex_New', title=None, scale=color_scale)
)
.properties(width=150, height=200)
.facet(
alt.Row('Victim_Age_New', title=None),
columns=2,
title=alt.Title("Proportion of crimes by area, victim age and victim sex")
)
.resolve_scale(x='independent')
)
frequencies = (alt.Chart(demographics_sorted).mark_bar().encode(
alt.X('COUNT', title='', sort='-x'),
#alt.X('COUNT', title='', sort='-x', scale=alt.Scale(domain=[0, 20000]))# rm axis titles for legibility
alt.Y('AREA NAME', title=''),
alt.Color('Victim_Sex_New', title=None, scale=color_scale))
.properties(width=150, height=200
).facet(
alt.Row('Victim_Age_New', title=None),
columns=2,
title=alt.Title("Number of crimes by area, victim age and victim sex")
)
.resolve_scale(x='independent')
)
proportions | frequencies
crime_cat_counts = crime_data['Crime Category'].value_counts()
crime_cat_counts
crime_cat_counts.plot(kind='barh', color='darkblue')
plt.title('Crime Category Distribution')
plt.xlabel('Count')
plt.show()
plt.show()
# Filtering out 'H'
df_crime_filtered = crime_data[crime_data['Vict Sex'].isin(['M', 'F', 'X'])]
crime_gender_counts = df_crime_filtered.groupby(['Crime Category', 'Vict Sex']).size().unstack().fillna(0)
fig, axes = plt.subplots(nrows=len(crime_gender_counts), ncols=1, figsize=(10, 5 * len(crime_gender_counts)))
# Iterate through each crime category and plot the corresponding donut chart
for i, (crime_category, counts) in enumerate(crime_gender_counts.iterrows()):
ax = axes[i]
# Create a donut plot using pie chart with a wedge set to white
wedges, texts, autotexts = ax.pie(counts, labels=['Female', 'Male', 'Unknown'], autopct='%1.1f%%',
startangle=90, colors=['pink', 'lightblue', 'gray'], wedgeprops=dict(width=0.3))
plt.setp(wedges, edgecolor='white')
ax.set_title(crime_category)
# Adjust layout to prevent overlapping
plt.tight_layout()
plt.show()
proportions2 = (
alt.Chart(demographics_sorted)
.mark_bar()
.encode(
alt.X('sum(COUNT):Q', stack='normalize', title='', sort='x'), # Remove axis titles for legibility
alt.Y('Vict_Descent_New:N', title=''),
alt.Color('Victim_Sex_New:N', title=None, scale=color_scale)
)
.properties(width=150, height=200)
.facet(
alt.Row('Victim_Age_New:O', title=None),
columns=2#,
#title=alt.TitleParams("Proportion of crimes by victim demographics", align='center')
)
.resolve_scale(x='independent')
)
frequencies2 = (alt.Chart(demographics_sorted).mark_bar().encode(
alt.X('sum(COUNT):Q', title='', sort='x'),
#alt.X('COUNT', title='', sort='-x', scale=alt.Scale(domain=[0, 20000]))# rm axis titles for legibility
alt.Y('Vict_Descent_New:N', title=''),
alt.Color('Victim_Sex_New:N', title=None, scale=color_scale))
.properties(width=150, height=200
).facet(
alt.Row('Victim_Age_New:O', title=None),
columns=2#,
#title=alt.Title("Number of crimes by victim demographics")
)
.resolve_scale(x='independent')
)
frequencies2 | proportions2
titleF = alt.TitleParams(
text='Crime Distribution Among Females',
subtitle='Analysis by Age Group and Racial/Ethnic Category')
female_demo = alt.Chart(demographics_sorted[demographics_sorted['Victim_Sex_New']=='Female'], title=titleF).mark_rect(
).encode(
alt.X('Vict_Descent_New:N', title=None, axis=alt.Axis(labelAngle=0, ticks=True)),
alt.Y('Victim_Age_New:O', title=None, sort=choices,axis=alt.Axis(ticks=True, labels=True)),
alt.Color('sum(COUNT):Q', title='', scale=alt.Scale(scheme='purplered')),
alt.Size('sum(COUNT):Q')
).properties(
width=300,
height=300
)
titleM = alt.TitleParams(
text='Crime Distribution Among Males',
subtitle='Analysis by Age Group and Racial/Ethnic Category')
male_demo = alt.Chart(demographics_sorted[demographics_sorted['Victim_Sex_New']=='Male'], title=titleM).mark_rect(
).encode(
alt.X('Vict_Descent_New:N', title=None, axis=alt.Axis(labelAngle=0, ticks=True)),
alt.Y('Victim_Age_New:O', title=None, sort=choices, axis=alt.Axis(ticks=True, labels=True)),
alt.Color('sum(COUNT):Q', title='', scale=alt.Scale(scheme='purpleblue')),
alt.Size('sum(COUNT):Q')
).properties(
width=300,
height=300
)
titleU = alt.TitleParams(
text='Crime Distribution Among Individuals with Undisclosed Gender',
subtitle='Analysis by Age Group and Racial/Ethnic Category')
unknown_demo = alt.Chart(demographics_sorted[demographics_sorted['Victim_Sex_New']=='Unknown'], title=titleU).mark_rect(
).encode(
alt.X('Vict_Descent_New:N', title=None, axis=alt.Axis(labelAngle=0, ticks=True)),
alt.Y('Victim_Age_New:O', title=None, sort=choices, axis=alt.Axis(ticks=True, labels=True)),
alt.Color('sum(COUNT):Q', title='', scale=alt.Scale(scheme='greys')),
alt.Size('sum(COUNT):Q')
).properties(
width=300,
height=300
)
female_demo
male_demo
unknown_demo
df_crime_2020 = crime_data[crime_data['Year'] == 2020]
df_crime_2021 = crime_data[crime_data['Year'] == 2021]
df_crime_2022 = crime_data[crime_data['Year'] == 2022]
df_crime_2023 = crime_data[crime_data['Year'] == 2023]
import folium
from folium.plugins import HeatMap
# Function to create and save heatmap for a given year
def create_and_save_heatmap(df, year):
# Create a base map centered around LA
la_map = folium.Map(location=[34.0522, -118.2437], zoom_start=12)
# Convert the DataFrame to a list of tuples (latitude, longitude)
heat_data = [[row['LAT'], row['LON']] for _, row in df.iterrows()]
# Create heatmap for the current year
HeatMap(heat_data, radius=15, blur=1, gradient={0.4: 'blue', 0.8: 'lime', 1: 'red'},
min_opacity=0.01, name=f'Crime Heatmap {year}').add_to(la_map)
# Save the map
la_map.save(f"crime_heatmap_{year}.html")
# Create and save heatmaps for each year
for year, df_year in zip([2020, 2021, 2022, 2023], [df_crime_2020, df_crime_2021, df_crime_2022, df_crime_2023]):
create_and_save_heatmap(df_year, year)
print("crime_heatmap_* will be saved to your local file")
crime_heatmap_* will be saved to your local file
# Across day and night
mean_value = crime_data['Time Category'].value_counts().mean()
plt.figure(figsize=(6, 2))
crime_data['Time Category'].value_counts().plot(kind='bar',color='black',label="crime distribution")
plt.axhline(y=mean_value, color='red', linestyle='--', label='Mean')
plt.title('Distribution of Crime across day and night')
plt.ylabel('Number of Crimes')
plt.legend()
plt.show()
#Detail into day
crime_data['TIME_OCCURRENCE'] = pd.to_datetime(crime_data['TIME_OCCURRENCE'])
plt.figure(figsize=(12, 3))
time_bins = pd.cut(crime_data['TIME_OCCURRENCE'].dt.hour, bins=range(0, 25, 1), right=False)
time_bins.value_counts().sort_index().plot(kind='bar', color='black',label="Crime distribution")
mean_value = time_bins.value_counts().mean()
plt.axhline(y=mean_value, color='red', linestyle='--', label='Mean')
plt.title('Distribution of crime occurred across different time period')
plt.xlabel('Time')
plt.ylabel('Frequency')
plt.xticks(rotation=90)
plt.legend()
plt.tight_layout()
plt.show()
chart = alt.Chart(crime_data).mark_bar(color='black').encode(
#x=alt.X('time occurance bins', axis=alt.Axis(title='Time',labelAngle=90), sort=sort_bin),
x=alt.X('time occurance bins', axis=alt.Axis(title='Time',labelAngle=90)),
y=alt.Y('count():Q', axis=alt.Axis(title='Frequency', format='s')),
).properties(height =100, width = 250, title="Distribution of crime occurred across different time period by year").facet(facet='Date Occured Year:N',columns=3)
chart
shortchart = alt.Chart(crime_data).mark_bar(color='black').encode(
x=alt.X('time occurance bins h', axis=alt.Axis(title='24 hours time frame',labelAngle=0), scale=alt.Scale(domain=[1,24])),
y=alt.Y('count():Q', axis=alt.Axis(title='Frequency', format='s')),
).properties(height =100, width = 250, title="\t\tDistribution of crime occurred across different time period by year").facet(facet='Date Occured Year:N',columns=3).properties(title="Frequency of Crime Occurred break down by 24 hours time frame").configure_title(
align='center'
)
crime_data['Date Occured Day']
shortchart
shortchart = alt.Chart(crime_data).mark_bar(color='black').encode(
y=alt.Y('Date Occured Day', axis=alt.Axis(title='days')),
x=alt.X('count():Q', axis=alt.Axis(title='Number of Crimes', format='s'),sort='-y'),
).properties(height =100, width = 250, title="\t\tDistribution of crime occurred across different days by year").facet(facet='Date Occured Year:N',columns=3).properties(title="Frequency of Crime Occurred break down by 24 hours time frame").configure_title(
align='center'
)
shortchart
# shortchart = alt.Chart(crime_data).mark_bar(color='black').encode(
# x=alt.X('Date Occured Day', axis=alt.Axis(title='days',labelAngle=90)),
# y=alt.Y('count():Q', axis=alt.Axis(title='Number of Crimes', format='s')),
# )
# shortchart
#crime_data['Date Occured Day']
#crime_data['Time Category']
# Calculate value counts of 'Time Category'
time_category_counts = crime_data['Time Category'].value_counts().reset_index()
time_category_counts.columns = ['Time Category', 'Count']
mean_value = time_category_counts['Count'].mean()
# Create Altair bar chart
bar_chart = alt.Chart(time_category_counts).mark_bar(color='black').encode(
x='Time Category',
y=alt.Y('Count',title="Number of Crimes")
).properties(
title='Distribution of Crime across day and night',
width=200,
height=100
)
mean_line = alt.Chart(pd.DataFrame({'mean_value': [mean_value]})).mark_rule(color='red', strokeDash=[2,2], strokeWidth=3).encode(
y='mean_value:Q'
)
bar_chart + mean_line
TimeCategorySplit = crime_data['Time Category'].value_counts()
print("\nNumber of crimes on day and night:")
print(TimeCategorySplit)
print("\nPercentage of crimes on day and night:")
print(TimeCategorySplit/TimeCategorySplit.sum()*100)
TimeCategorySafe = TimeCategorySplit.tail(1)
TimeCategoryUnsafe = TimeCategorySplit.head(1)
#############################
TimeHourSplit = time_bins.value_counts()
print("\nPercentage of crimes on each hour of the day:")
print(TimeHourSplit/TimeHourSplit.sum()*100)
print("\nCrime Count mean:")
mean_value = TimeHourSplit.mean()
print(mean_value)
TimeHourSafe = TimeHourSplit.tail(3)
TimeHourUnsafe = TimeHourSplit.head(3)
#############################
print("\nHours which has Crime over the mean and respective count:")
time_bin_greater_than_bin = time_bins.value_counts()[time_bins.value_counts()>mean_value]
print(time_bin_greater_than_bin.index.to_list())
print("\nHours which has Crime over the mean and respective percentage:")
print(time_bin_greater_than_bin.value_counts()/time_bins.value_counts().sum()*100)
#############################
print("\nPercentage of crimes on each month of the year:")
MonthSplit = crime_data['Date Occured Month'].value_counts()
print(MonthSplit)
print(MonthSplit/MonthSplit.sum()*100)
print(crime_data.groupby('Date Occured Year')['Date Occured Month'].value_counts()/crime_data['Date Occured Month'].value_counts().sum()*100)
MonthSplitSafe = MonthSplit.tail(3)
MonthSplitUnsafe = MonthSplit.head(3)
#############################
DaySplit = crime_data['Date Occured Day'].value_counts()
print("\nNumber of crimes on days")
print(DaySplit)
print("\nPercentage of crimes on days")
print(DaySplit/DaySplit.sum()*100)
DaySafe = DaySplit.tail(2)
DayUnsafe = DaySplit.head(2)
print("\nNumber of crimes on days")
print(crime_data.groupby('Date Occured Year')['Date Occured Day'].value_counts())
print("\nPercentage of crimes on days")
print(crime_data.groupby('Date Occured Year')['Date Occured Day'].value_counts()/crime_data['Date Occured Day'].value_counts().sum()*100)
Number of crimes on day and night:
Daytime 400057
Nighttime 266785
Name: Time Category, dtype: int64
Percentage of crimes on day and night:
Daytime 59.992772
Nighttime 40.007228
Name: Time Category, dtype: float64
Percentage of crimes on each hour of the day:
[12, 13) 7.152669
[18, 19) 5.641816
[17, 18) 5.574784
[20, 21) 5.387783
[19, 20) 5.237972
[15, 16) 5.227325
[16, 17) 5.182487
[14, 15) 4.939251
[21, 22) 4.928004
[22, 23) 4.729606
[13, 14) 4.692716
[11, 12) 4.596141
[10, 11) 4.508114
[0, 1) 4.353505
[23, 24) 4.152258
[8, 9) 3.796851
[9, 10) 3.764610
[1, 2) 3.109282
[7, 8) 2.609014
[2, 3) 2.529685
[6, 7) 2.306993
[3, 4) 2.155233
[4, 5) 1.777183
[5, 6) 1.646717
Name: TIME_OCCURRENCE, dtype: float64
Crime Count mean:
27785.083333333332
Hours which has Crime over the mean and respective count:
[Interval(12, 13, closed='left'), Interval(18, 19, closed='left'), Interval(17, 18, closed='left'), Interval(20, 21, closed='left'), Interval(19, 20, closed='left'), Interval(15, 16, closed='left'), Interval(16, 17, closed='left'), Interval(14, 15, closed='left'), Interval(21, 22, closed='left'), Interval(22, 23, closed='left'), Interval(13, 14, closed='left'), Interval(11, 12, closed='left'), Interval(10, 11, closed='left'), Interval(0, 1, closed='left')]
Hours which has Crime over the mean and respective percentage:
47697 0.00015
37622 0.00015
37175 0.00015
35928 0.00015
34929 0.00015
34858 0.00015
34559 0.00015
32937 0.00015
32862 0.00015
31539 0.00015
31293 0.00015
30649 0.00015
30062 0.00015
29031 0.00015
Name: TIME_OCCURRENCE, dtype: float64
Percentage of crimes on each month of the year:
1 67625
7 56811
10 56545
8 56299
6 54739
5 54714
9 54299
3 53691
2 53474
12 53461
11 52746
4 52438
Name: Date Occured Month, dtype: int64
1 10.141083
7 8.519409
10 8.479520
8 8.442630
6 8.208691
5 8.204942
9 8.142708
3 8.051532
2 8.018991
12 8.017042
11 7.909820
4 7.863632
Name: Date Occured Month, dtype: float64
Date Occured Year Date Occured Month
2020 1 2.205920
2 2.051311
7 1.911847
8 1.904199
5 1.881855
6 1.873157
3 1.852013
10 1.849614
9 1.792029
12 1.766985
11 1.763236
4 1.737743
2021 10 2.165580
7 2.161981
8 2.087001
9 2.084452
11 2.062857
12 1.989827
6 1.959835
5 1.942139
3 1.849014
1 1.848414
4 1.821421
2 1.705052
2022 10 2.337885
6 2.325438
5 2.316141
12 2.300095
8 2.299945
7 2.274452
4 2.259906
3 2.247459
9 2.217767
11 2.119393
1 2.067956
2 2.020569
2023 1 2.235012
7 2.171129
8 2.151484
10 2.126441
3 2.103047
2 2.081603
5 2.064807
6 2.050261
9 2.048461
4 2.044562
11 1.964333
12 1.960134
2024 1 1.783781
2 0.160458
Name: Date Occured Month, dtype: float64
Number of crimes on days
Friday 100515
Saturday 98742
Monday 94364
Wednesday 94249
Sunday 94164
Thursday 93568
Tuesday 91240
Name: Date Occured Day, dtype: int64
Percentage of crimes on days
Friday 15.073286
Saturday 14.807406
Monday 14.150878
Wednesday 14.133633
Sunday 14.120886
Thursday 14.031510
Tuesday 13.682402
Name: Date Occured Day, dtype: float64
Number of crimes on days
Date Occured Year Date Occured Day
2020 Friday 22309
Wednesday 21937
Saturday 21921
Thursday 21355
Monday 21333
Sunday 20942
Tuesday 20842
2021 Friday 23956
Monday 23103
Saturday 23023
Sunday 22478
Wednesday 22032
Thursday 21921
Tuesday 21379
2022 Friday 27384
Saturday 27017
Thursday 25190
Wednesday 25031
Sunday 24992
Monday 24575
Tuesday 24438
2023 Friday 24928
Saturday 24775
Sunday 24080
Monday 23455
Wednesday 23396
Thursday 23345
Tuesday 22740
2024 Saturday 2006
Friday 1938
Monday 1898
Wednesday 1853
Tuesday 1841
Thursday 1757
Sunday 1672
Name: Date Occured Day, dtype: int64
Percentage of crimes on days
Date Occured Year Date Occured Day
2020 Friday 3.345470
Wednesday 3.289685
Saturday 3.287285
Thursday 3.202408
Monday 3.199109
Sunday 3.140474
Tuesday 3.125478
2021 Friday 3.592455
Monday 3.464539
Saturday 3.452542
Sunday 3.370813
Wednesday 3.303931
Thursday 3.287285
Tuesday 3.206007
2022 Friday 4.106520
Saturday 4.051484
Thursday 3.777507
Wednesday 3.753663
Sunday 3.747814
Monday 3.685281
Tuesday 3.664736
2023 Friday 3.738217
Saturday 3.715273
Sunday 3.611050
Monday 3.517325
Wednesday 3.508477
Thursday 3.500829
Tuesday 3.410103
2024 Saturday 0.300821
Friday 0.290624
Monday 0.284625
Wednesday 0.277877
Tuesday 0.276077
Thursday 0.263481
Sunday 0.250734
Name: Date Occured Day, dtype: float64
TimeHourSafe = ','.join(map(str, TimeHourSafe.reset_index()['index']))
TimeHourUnsafe = ','.join(map(str, TimeHourUnsafe.reset_index()['index']))
TimeCategorySafe = ','.join(map(str, TimeCategorySafe.reset_index()['index']))
TimeCategoryUnsafe = ','.join(map(str, TimeCategoryUnsafe.reset_index()['index']))
DaySafe = ','.join(map(str, DaySafe.reset_index()['index']))
DayUnsafe = ','.join(map(str, DayUnsafe.reset_index()['index']))
MonthSafe = ','.join(map(str, MonthSplitSafe.reset_index()['index']))
MonthUnsafe = ','.join(map(str, MonthSplitUnsafe.reset_index()['index']))
data = {
'hour': [TimeHourSafe, TimeHourUnsafe],
'light/dark': [TimeCategorySafe, TimeCategoryUnsafe],
'days': [DaySafe, DayUnsafe],
'month':[MonthSafe,MonthUnsafe]
}
df = pd.DataFrame(data, index=['safe', 'unsafe'])
df
| hour | light/dark | days | month | |
|---|---|---|---|---|
| safe | [3, 4),[4, 5),[5, 6) | Nighttime | Thursday,Tuesday | 12,11,4 |
| unsafe | [12, 13),[18, 19),[17, 18) | Daytime | Friday,Saturday | 1,7,10 |
# Checking the overlap of Date Occurance and Date Reported
date_counts_occ = crime_data['DATE OCC'].value_counts().sort_index().reset_index()
date_counts_occ.columns = ['Date', 'Frequency']
date_counts_rptd = crime_data['Date Rptd'].value_counts().sort_index().reset_index()
date_counts_rptd.columns = ['Date', 'Frequency']
chart_occ = alt.Chart(date_counts_occ).mark_circle(color='green').encode(
x=alt.X('Date:T', axis=alt.Axis(title='Date Occured')),
y='Frequency:Q'
).properties(height=300, width=1000)
chart_rptd = alt.Chart(date_counts_rptd).mark_circle(color='black').encode(
x=alt.X('Date:T', axis=alt.Axis(title='Date Reported')),
y='Frequency:Q',
)
scat = (chart_occ + chart_rptd).properties(title= alt.TitleParams("Count of crime occured and reported across time", subtitle=['green:Date Occured black:Date Reported']))
# Distribution of crime in a day
hist = alt.Chart(crime_data).mark_bar(color="black").encode(
x=alt.X('response_rate_in_day:O', axis=alt.Axis(title='Day')),
y=alt.Y('count():Q', axis=alt.Axis(title='Frequency'))
).properties(title= "Distribution of reporting frequency across day")
# Distribution of crime in hr
hist_hr = alt.Chart(crime_data).mark_bar(color="black").encode(
x=alt.X('response_rate_in_hour:O', axis=alt.Axis(title='Day')),
y=alt.Y('count():Q', axis=alt.Axis(title='Frequency'))
).properties(title= "Distribution of reporting frequency across day")
scat & hist & hist_hr
#Yearly distribution
data_2020 = crime_data[crime_data['Date Occured Year']==2020]
data_2021 = crime_data[crime_data['Date Occured Year']==2021]
data_2022 = crime_data[crime_data['Date Occured Year']==2022]
data_2023 = crime_data[crime_data['Date Occured Year']==2023]
data_2024 = crime_data[crime_data['Date Occured Year']==2024]
hist_2020 =alt.Chart(data_2020).mark_bar(color="black").encode(
x=alt.X('response_rate_in_day:O', axis=alt.Axis(title='Month')),
y=alt.Y('count():Q', axis=alt.Axis(title='Frequency'))
).properties(title="2020 response rate")
hist_2021 =alt.Chart(data_2021).mark_bar(color="black").encode(
x=alt.X('response_rate_in_day:O', axis=alt.Axis(title='Month')),
y=alt.Y('count():Q', axis=alt.Axis(title='Frequency'))
).properties(title="2021 response rate")
hist_2022 =alt.Chart(data_2022).mark_bar(color="black").encode(
x=alt.X('response_rate_in_day:O', axis=alt.Axis(title='Month')),
y=alt.Y('count():Q', axis=alt.Axis(title='Frequency'))
).properties(title="2022 response rate")
hist_2023 =alt.Chart(data_2023).mark_bar(color="black").encode(
x=alt.X('response_rate_in_day:O', axis=alt.Axis(title='Month')),
y=alt.Y('count():Q', axis=alt.Axis(title='Frequency'))
).properties(title="2023 response rate")
hist_2024 =alt.Chart(data_2024).mark_bar(color="black").encode(
x=alt.X('response_rate_in_day:O', axis=alt.Axis(title='Month')),
y=alt.Y('count():Q', axis=alt.Axis(title='Frequency'))
).properties(title="2024 response rate")
hist_2020 & hist_2021 & hist_2022 & hist_2023 & hist_2024
print("Responses by area name:")
print(crime_data.groupby('AREA NAME')['response_rate_in_day'].sum().sort_values(ascending=True))
print("\nResponses by Date Reported Year:")
print(crime_data.groupby('Date Rptd Year')['response_rate_in_day'].sum().sort_values(ascending=True))
print("\nResponses by Date Reported Month:")
print(crime_data.groupby('Date Rptd Month')['response_rate_in_day'].sum().sort_values(ascending=True))
print("\nResponses by Victim age group:")
print(crime_data.groupby('Victim_Age_New')['response_rate_in_day'].sum().sort_values(ascending=True))
print("\nResponses by Crime Category:")
print(crime_data.groupby('Crime Category')['response_rate_in_day'].sum().sort_values(ascending=True))
print("Percentage Responses by area name:")
print((crime_data.groupby('AREA NAME')['response_rate_in_day'].sum()/crime_data.groupby('AREA NAME')['response_rate_in_day'].count()).sort_values(ascending=True))
print("\nPercentage Responses by Date Reported Year:")
print((crime_data.groupby('Date Rptd Year')['response_rate_in_day'].sum()/crime_data.groupby('Date Rptd Year')['response_rate_in_day'].count()).sort_values(ascending=True))
print("\nPercentage Responses by Date Reported Month:")
print((crime_data.groupby('Date Rptd Month')['response_rate_in_day'].sum()/crime_data.groupby('Date Rptd Month')['response_rate_in_day'].count()).sort_values(ascending=True))
print("\nPercentage Responses by Victim age group:")
print((crime_data.groupby('Victim_Age_New')['response_rate_in_day'].sum()/crime_data.groupby('Victim_Age_New')['response_rate_in_day'].count()).sort_values(ascending=True))
print("\nPercentage Responses by Crime Category:")
print((crime_data.groupby('Crime Category')['response_rate_in_day'].sum()/crime_data.groupby('Crime Category')['response_rate_in_day'].count()).sort_values(ascending=True))
Responses by area name: AREA NAME Hollenbeck 222637 Harbor 260576 Newton 307119 Rampart 331188 Foothill 334039 Southeast 340103 Northeast 362720 Mission 382091 Central 397330 Pacific 401488 Olympic 402740 Wilshire 416810 Hollywood 424014 West Valley 439160 West LA 440777 Van Nuys 445055 Topanga 454295 Southwest 456410 Devonshire 466560 N Hollywood 482369 77th Street 488393 Name: response_rate_in_day, dtype: int64 Responses by Date Reported Year: Date Rptd Year 2024 330890 2020 707249 2021 1695164 2022 2500723 2023 3021848 Name: response_rate_in_day, dtype: int64 Responses by Date Reported Month: Date Rptd Month 7 601422 4 628791 9 636088 2 642351 3 649208 8 651130 6 663193 5 681467 11 702412 10 708080 12 760772 1 930960 Name: response_rate_in_day, dtype: int64 Responses by Victim age group: Victim_Age_New Early Childhood(<5) 69815 Middle Childhood(6-13) 364660 Late Adolescence(18-21) 377985 Early Adolescence(13-17) 471261 Late Adulthood(>65) 695749 Middle Adulthood(41-65) 2730522 Early Adulthood(22-40) 3545882 Name: response_rate_in_day, dtype: int64 Responses by Crime Category: Crime Category Fraud and White-Collar Crimes 202979 Vandalism and Property Damage 281951 Other 726889 Assault and Violence 813556 Sexual Offenses 840694 Theft and Robbery 5389805 Name: response_rate_in_day, dtype: int64 Percentage Responses by area name: AREA NAME Central 8.391341 Newton 9.883154 Hollenbeck 9.928071 Southeast 10.068176 Harbor 10.321068 Pacific 10.753375 Southwest 10.815147 Rampart 10.820662 77th Street 11.527403 Hollywood 11.994399 Olympic 12.217200 Wilshire 12.926345 Northeast 13.479505 Mission 14.088901 West LA 14.518825 Topanga 14.678352 Van Nuys 14.835167 N Hollywood 14.957642 Foothill 14.964564 West Valley 15.724158 Devonshire 17.907423 Name: response_rate_in_day, dtype: float64 Percentage Responses by Date Reported Year: Date Rptd Year 2020 4.883912 2021 10.815961 2022 13.955005 2023 17.677622 2024 21.825078 Name: response_rate_in_day, dtype: float64 Percentage Responses by Date Reported Month: Date Rptd Month 7 10.615140 8 11.492693 9 11.693654 2 12.005214 4 12.046959 6 12.081999 3 12.139721 5 12.448932 10 12.606915 11 13.196040 1 13.875252 12 14.132338 Name: response_rate_in_day, dtype: float64 Percentage Responses by Victim age group: Victim_Age_New Late Adolescence(18-21) 9.760245 Early Adulthood(22-40) 10.813484 Middle Adulthood(41-65) 11.802202 Late Adulthood(>65) 15.371252 Early Adolescence(13-17) 29.562825 Early Childhood(<5) 37.294338 Middle Childhood(6-13) 63.210262 Name: response_rate_in_day, dtype: float64 Percentage Responses by Crime Category: Crime Category Assault and Violence 4.321541 Vandalism and Property Damage 4.658114 Other 9.313358 Theft and Robbery 16.840036 Sexual Offenses 51.252454 Fraud and White-Collar Crimes 57.225543 Name: response_rate_in_day, dtype: float64
response_area_list = (crime_data.groupby('AREA NAME')['response_rate_in_day'].sum()/crime_data.groupby('AREA NAME')['response_rate_in_day'].count()).sort_values(ascending=True)
response_crime_list = (crime_data.groupby('Crime Category')['response_rate_in_day'].sum()/crime_data.groupby('Crime Category')['response_rate_in_day'].count()).sort_values(ascending=True)
response_victim_list = (crime_data.groupby('Victim_Age_New')['response_rate_in_day'].sum()/crime_data.groupby('Victim_Age_New')['response_rate_in_day'].count()).sort_values(ascending=True)
# print(response_area_list)
# print(response_crime_list)
# print(response_victim_list)
areaSafe = ','.join(map(str, response_area_list.head(3).reset_index()['AREA NAME']))
areaUnsafe = ','.join(map(str, response_area_list.tail(3).reset_index()['AREA NAME']))
crimeSafe = ','.join(map(str, response_crime_list.head(3).reset_index()['Crime Category']))
crimeUnsafe = ','.join(map(str, response_crime_list.tail(3).reset_index()['Crime Category']))
victimSafe = ','.join(map(str, response_victim_list.head(3).reset_index()['Victim_Age_New']))
VictimUnsafe = ','.join(map(str, response_victim_list.tail(3).reset_index()['Victim_Age_New']))
data = {
#'Area': [areaSafe, areaUnsafe],
'Crime': [crimeSafe, crimeUnsafe],
#'Victim': [victimSafe, VictimUnsafe]
}
df = pd.DataFrame(data, index=['Responsive', 'Less Responsive'])
pd.set_option('display.max_columns', None) # To display all columns
pd.set_option('display.max_rows', None) # To display all rows
pd.set_option('display.width', None)
print(df)
print("========")
print(crimeSafe)
print("========")
print(crimeUnsafe)
print("========")
print(victimSafe)
print("========")
print(VictimUnsafe)
Crime Responsive Assault and Violence,Vandalism and Property Da... Less Responsive Theft and Robbery,Sexual Offenses,Fraud and Wh... ======== Assault and Violence,Vandalism and Property Damage,Other ======== Theft and Robbery,Sexual Offenses,Fraud and White-Collar Crimes ======== Late Adolescence(18-21),Early Adulthood(22-40),Middle Adulthood(41-65) ======== Early Adolescence(13-17),Early Childhood(<5),Middle Childhood(6-13)
response_rate_by_age = (crime_data.groupby('Victim_Age_New')['response_rate_in_day'].sum() / crime_data.groupby('Victim_Age_New')['response_rate_in_day'].count()).reset_index()
response_rate_by_age.columns = ['Victim_Age_New', 'Response_Rate']
chart = alt.Chart(response_rate_by_age).mark_line(point=True,color='black').encode(
x=alt.X('Victim_Age_New:N', title='Age group',sort=choices),
y=alt.Y('Response_Rate:Q', title='Response rate'),
tooltip=['Victim_Age_New:N', 'Response_Rate:Q']
).properties(
width=600,
height=300,
title='Response rate by Age group'
).configure_axis(
labelAngle=45
)
chart
counts = (crime_data.groupby('Date Rptd Year')['response_rate_in_day'].sum() / crime_data.groupby('Date Rptd Year')['response_rate_in_day'].count()).sort_index()
counts_df = pd.DataFrame({'Year': counts.index, 'Response Rate': counts.values})
chart2 = alt.Chart(counts_df).mark_line(point=True, color='black').encode(
x='Year:O',
y='Response Rate:Q',
tooltip=['Year:O', 'Response Rate:Q']
).properties(
title='Response rate by Year',
width=600,
height=300
).configure_axis(
labelFontSize=12,
titleFontSize=14
)
chart2
# Fitting the distribution
crime_data1 = crime_data[crime_data['response_rate_in_day']<=24]
x_poisson = np.arange(0, int(crime_data1['response_rate_in_day'].max())) #np.linspace(crime_data['response_rate_in_day'].min(), crime_data['response_rate_in_day'].max(), 100) #
params_exp = expon.fit(crime_data1['response_rate_in_day'])
fitted_data_exp = expon.pdf(x_poisson, *params_exp)
params_gamma = gamma.fit(crime_data1['response_rate_in_day'])
fitted_data_gamma = gamma.pdf(x_poisson, *params_gamma)
mean_poissons = crime_data1['response_rate_in_day'].mean()
fitted_data_poisson = poisson.pmf(x_poisson, mean_poissons)
# Plotting
plt.hist(crime_data1['response_rate_in_day'], bins=24, density=True, color='black', alpha=0.7, label='response rate')
plt.plot(x_poisson, fitted_data_exp, linewidth=2, label='Exponential')
#plt.plot(x_poisson, fitted_data_gamma, linewidth=2, label='Gamma')
plt.plot(x_poisson, fitted_data_poisson, linewidth=2, label='Poisson')
plt.title('Response Time Distribution: Duration Between Crime Occurrence and Reporting')
plt.xlabel('Days')
plt.ylabel('Response rate')
plt.legend()
plt.show()
# Finding propability of responding in 1 day and not responding in 1 day
params_exp = expon.fit(crime_data['response_rate_in_day'])
fitted_data_exp = expon.pdf(x_poisson, *params_exp)
cdf_at_1 = expon.cdf(1, *params_exp)
cdf_at_1
0.07759586188986314
#Crime and reporting probability
crime_and_prob = {}
crime_types = ['Theft and Robbery', 'Assault and Violence', 'Vandalism and Property Damage', 'Sexual Offenses', 'Other','Fraud and White-Collar Crimes']
for crime in crime_types:
filtered_data = crime_data[crime_data['Crime Category']==crime]
params_exp = expon.fit(filtered_data['response_rate_in_day'])
fitted_data_exp = expon.pdf(x_poisson, *params_exp)
cdf_at_1 = expon.cdf(1, *params_exp)
crime_and_prob[crime] = {}
crime_and_prob[crime]["Probability of People reporting a crime less than 1 day"] = cdf_at_1
crime_and_prob[crime]["Probability of People reporting a crime more than 1 day"] = 1-cdf_at_1
print(crime," : ",cdf_at_1)
pd.DataFrame.from_dict(crime_and_prob)
Theft and Robbery : 0.05765355570848174 Assault and Violence : 0.2065771291038082 Vandalism and Property Damage : 0.19319973655911718 Sexual Offenses : 0.019322148222487758 Other : 0.10180910246292221 Fraud and White-Collar Crimes : 0.017322916807033562
| Theft and Robbery | Assault and Violence | Vandalism and Property Damage | Sexual Offenses | Other | Fraud and White-Collar Crimes | |
|---|---|---|---|---|---|---|
| Probability of People reporting a crime less than 1 day | 0.057654 | 0.206577 | 0.1932 | 0.019322 | 0.101809 | 0.017323 |
| Probability of People reporting a crime more than 1 day | 0.942346 | 0.793423 | 0.8068 | 0.980678 | 0.898191 | 0.982677 |
import pandas as pd
from scipy.stats import expon
crime_and_prob = {}
crime_types = ['Theft and Robbery', 'Assault and Violence', 'Vandalism and Property Damage', 'Sexual Offenses', 'Other', 'Fraud and White-Collar Crimes']
areas = ['Central', '77th Street', 'Southwest']
for area in areas:
crime_and_prob[area] = {}
for crime in crime_types:
filtered_data = crime_data[(crime_data['Crime Category'] == crime) & (crime_data['AREA NAME'] == area)]
params_exp = expon.fit(filtered_data['response_rate_in_day'])
cdf_at_1 = expon.cdf(1, *params_exp)
crime_and_prob[area][crime] = {}
crime_and_prob[area][crime]["Probability of People reporting a crime less than 1 day"] = cdf_at_1
crime_and_prob[area][crime]["Probability of People reporting a crime more than 1 day"] = 1 - cdf_at_1
#print(area, crime, " : ", cdf_at_1)
# Create a DataFrame from the dictionary
df = pd.DataFrame.from_dict({(i,j): crime_and_prob[i][j]
for i in crime_and_prob.keys()
for j in crime_and_prob[i].keys()}).transpose()
df
| Probability of People reporting a crime less than 1 day | Probability of People reporting a crime more than 1 day | ||
|---|---|---|---|
| Central | Theft and Robbery | 0.083601 | 0.916399 |
| Assault and Violence | 0.312763 | 0.687237 | |
| Vandalism and Property Damage | 0.214548 | 0.785452 | |
| Sexual Offenses | 0.025438 | 0.974562 | |
| Other | 0.196652 | 0.803348 | |
| Fraud and White-Collar Crimes | 0.019906 | 0.980094 | |
| 77th Street | Theft and Robbery | 0.046834 | 0.953166 |
| Assault and Violence | 0.310085 | 0.689915 | |
| Vandalism and Property Damage | 0.303747 | 0.696253 | |
| Sexual Offenses | 0.019287 | 0.980713 | |
| Other | 0.117051 | 0.882949 | |
| Fraud and White-Collar Crimes | 0.014873 | 0.985127 | |
| Southwest | Theft and Robbery | 0.061745 | 0.938255 |
| Assault and Violence | 0.286378 | 0.713622 | |
| Vandalism and Property Damage | 0.240503 | 0.759497 | |
| Sexual Offenses | 0.017923 | 0.982077 | |
| Other | 0.128236 | 0.871764 | |
| Fraud and White-Collar Crimes | 0.022941 | 0.977059 |
crime_data['OccDateTimeCombined'] = pd.to_datetime(crime_data['OccDateTimeCombined'])
data_crime = crime_data['OccDateTimeCombined'].value_counts().sort_index()
data_crime = pd.DataFrame(data_crime)
data_crime.index = pd.to_datetime(data_crime.index)
data_crime_resampled = data_crime.resample('H').sum() # Resample without specifying 'on'
freq = 'H'
sm.tsa.adfuller(data_crime_resampled)
(-15.336003986770589,
3.8881302311215366e-28,
53,
35865,
{'1%': -3.430532344014907,
'5%': -2.8616205916097854,
'10%': -2.566812896370319},
235261.80709925527)
fig = plt.figure(figsize=(12, 6))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(data_crime_resampled, lags=20, ax=ax1, alpha=0.85, color='black')
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(data_crime_resampled, lags=20, ax=ax2, alpha=0.05, color='black')
data_crime_resampled_train = data_crime_resampled.head(35000)
data_crime_resampled_test = data_crime_resampled.tail(919)
res = STL(data_crime_resampled_train['OccDateTimeCombined'], period=24).fit()
res.plot()
plt.show()
arima_res = sm.tsa.ARIMA(res.resid + res.trend, order=(4,0,1)).fit()
steps = len(data_crime_resampled_test)
arima_res.forecast(steps=steps)
2023-12-29 08:00:00 15.828699 2023-12-29 09:00:00 15.915219 2023-12-29 10:00:00 16.072477 2023-12-29 11:00:00 16.115033 2023-12-29 12:00:00 16.184526 2023-12-29 13:00:00 16.251216 2023-12-29 14:00:00 16.317949 2023-12-29 15:00:00 16.382439 2023-12-29 16:00:00 16.445175 2023-12-29 17:00:00 16.506162 2023-12-29 18:00:00 16.565487 2023-12-29 19:00:00 16.623187 2023-12-29 20:00:00 16.679310 2023-12-29 21:00:00 16.733896 2023-12-29 22:00:00 16.786989 2023-12-29 23:00:00 16.838628 2023-12-30 00:00:00 16.888855 2023-12-30 01:00:00 16.937707 2023-12-30 02:00:00 16.985222 2023-12-30 03:00:00 17.031437 2023-12-30 04:00:00 17.076387 2023-12-30 05:00:00 17.120106 2023-12-30 06:00:00 17.162630 2023-12-30 07:00:00 17.203990 2023-12-30 08:00:00 17.244217 2023-12-30 09:00:00 17.283344 2023-12-30 10:00:00 17.321401 2023-12-30 11:00:00 17.358415 2023-12-30 12:00:00 17.394417 2023-12-30 13:00:00 17.429434 2023-12-30 14:00:00 17.463492 2023-12-30 15:00:00 17.496618 2023-12-30 16:00:00 17.528838 2023-12-30 17:00:00 17.560176 2023-12-30 18:00:00 17.590656 2023-12-30 19:00:00 17.620303 2023-12-30 20:00:00 17.649138 2023-12-30 21:00:00 17.677183 2023-12-30 22:00:00 17.704462 2023-12-30 23:00:00 17.730993 2023-12-31 00:00:00 17.756799 2023-12-31 01:00:00 17.781899 2023-12-31 02:00:00 17.806311 2023-12-31 03:00:00 17.830056 2023-12-31 04:00:00 17.853151 2023-12-31 05:00:00 17.875613 2023-12-31 06:00:00 17.897461 2023-12-31 07:00:00 17.918712 2023-12-31 08:00:00 17.939380 2023-12-31 09:00:00 17.959483 2023-12-31 10:00:00 17.979036 2023-12-31 11:00:00 17.998054 2023-12-31 12:00:00 18.016551 2023-12-31 13:00:00 18.034542 2023-12-31 14:00:00 18.052041 2023-12-31 15:00:00 18.069061 2023-12-31 16:00:00 18.085615 2023-12-31 17:00:00 18.101716 2023-12-31 18:00:00 18.117376 2023-12-31 19:00:00 18.132608 2023-12-31 20:00:00 18.147423 2023-12-31 21:00:00 18.161833 2023-12-31 22:00:00 18.175848 2023-12-31 23:00:00 18.189480 2024-01-01 00:00:00 18.202738 2024-01-01 01:00:00 18.215634 2024-01-01 02:00:00 18.228177 2024-01-01 03:00:00 18.240377 2024-01-01 04:00:00 18.252242 2024-01-01 05:00:00 18.263784 2024-01-01 06:00:00 18.275009 2024-01-01 07:00:00 18.285927 2024-01-01 08:00:00 18.296546 2024-01-01 09:00:00 18.306875 2024-01-01 10:00:00 18.316921 2024-01-01 11:00:00 18.326692 2024-01-01 12:00:00 18.336195 2024-01-01 13:00:00 18.345439 2024-01-01 14:00:00 18.354430 2024-01-01 15:00:00 18.363174 2024-01-01 16:00:00 18.371680 2024-01-01 17:00:00 18.379952 2024-01-01 18:00:00 18.387998 2024-01-01 19:00:00 18.395824 2024-01-01 20:00:00 18.403436 2024-01-01 21:00:00 18.410839 2024-01-01 22:00:00 18.418040 2024-01-01 23:00:00 18.425044 2024-01-02 00:00:00 18.431856 2024-01-02 01:00:00 18.438482 2024-01-02 02:00:00 18.444926 2024-01-02 03:00:00 18.451194 2024-01-02 04:00:00 18.457291 2024-01-02 05:00:00 18.463221 2024-01-02 06:00:00 18.468988 2024-01-02 07:00:00 18.474597 2024-01-02 08:00:00 18.480054 2024-01-02 09:00:00 18.485360 2024-01-02 10:00:00 18.490522 2024-01-02 11:00:00 18.495542 2024-01-02 12:00:00 18.500425 2024-01-02 13:00:00 18.505174 2024-01-02 14:00:00 18.509793 2024-01-02 15:00:00 18.514286 2024-01-02 16:00:00 18.518656 2024-01-02 17:00:00 18.522907 2024-01-02 18:00:00 18.527041 2024-01-02 19:00:00 18.531061 2024-01-02 20:00:00 18.534972 2024-01-02 21:00:00 18.538776 2024-01-02 22:00:00 18.542476 2024-01-02 23:00:00 18.546074 2024-01-03 00:00:00 18.549574 2024-01-03 01:00:00 18.552978 2024-01-03 02:00:00 18.556289 2024-01-03 03:00:00 18.559510 2024-01-03 04:00:00 18.562642 2024-01-03 05:00:00 18.565689 2024-01-03 06:00:00 18.568652 2024-01-03 07:00:00 18.571534 2024-01-03 08:00:00 18.574337 2024-01-03 09:00:00 18.577064 2024-01-03 10:00:00 18.579716 2024-01-03 11:00:00 18.582295 2024-01-03 12:00:00 18.584804 2024-01-03 13:00:00 18.587244 2024-01-03 14:00:00 18.589617 2024-01-03 15:00:00 18.591926 2024-01-03 16:00:00 18.594171 2024-01-03 17:00:00 18.596355 2024-01-03 18:00:00 18.598479 2024-01-03 19:00:00 18.600545 2024-01-03 20:00:00 18.602554 2024-01-03 21:00:00 18.604508 2024-01-03 22:00:00 18.606409 2024-01-03 23:00:00 18.608258 2024-01-04 00:00:00 18.610056 2024-01-04 01:00:00 18.611805 2024-01-04 02:00:00 18.613507 2024-01-04 03:00:00 18.615161 2024-01-04 04:00:00 18.616770 2024-01-04 05:00:00 18.618336 2024-01-04 06:00:00 18.619858 2024-01-04 07:00:00 18.621339 2024-01-04 08:00:00 18.622779 2024-01-04 09:00:00 18.624180 2024-01-04 10:00:00 18.625543 2024-01-04 11:00:00 18.626868 2024-01-04 12:00:00 18.628157 2024-01-04 13:00:00 18.629411 2024-01-04 14:00:00 18.630630 2024-01-04 15:00:00 18.631816 2024-01-04 16:00:00 18.632970 2024-01-04 17:00:00 18.634092 2024-01-04 18:00:00 18.635183 2024-01-04 19:00:00 18.636244 2024-01-04 20:00:00 18.637277 2024-01-04 21:00:00 18.638281 2024-01-04 22:00:00 18.639257 2024-01-04 23:00:00 18.640207 2024-01-05 00:00:00 18.641131 2024-01-05 01:00:00 18.642030 2024-01-05 02:00:00 18.642904 2024-01-05 03:00:00 18.643754 2024-01-05 04:00:00 18.644581 2024-01-05 05:00:00 18.645385 2024-01-05 06:00:00 18.646167 2024-01-05 07:00:00 18.646928 2024-01-05 08:00:00 18.647668 2024-01-05 09:00:00 18.648388 2024-01-05 10:00:00 18.649088 2024-01-05 11:00:00 18.649769 2024-01-05 12:00:00 18.650431 2024-01-05 13:00:00 18.651075 2024-01-05 14:00:00 18.651702 2024-01-05 15:00:00 18.652311 2024-01-05 16:00:00 18.652904 2024-01-05 17:00:00 18.653480 2024-01-05 18:00:00 18.654041 2024-01-05 19:00:00 18.654586 2024-01-05 20:00:00 18.655117 2024-01-05 21:00:00 18.655633 2024-01-05 22:00:00 18.656134 2024-01-05 23:00:00 18.656622 2024-01-06 00:00:00 18.657097 2024-01-06 01:00:00 18.657559 2024-01-06 02:00:00 18.658008 2024-01-06 03:00:00 18.658445 2024-01-06 04:00:00 18.658870 2024-01-06 05:00:00 18.659283 2024-01-06 06:00:00 18.659685 2024-01-06 07:00:00 18.660076 2024-01-06 08:00:00 18.660456 2024-01-06 09:00:00 18.660826 2024-01-06 10:00:00 18.661185 2024-01-06 11:00:00 18.661535 2024-01-06 12:00:00 18.661875 2024-01-06 13:00:00 18.662206 2024-01-06 14:00:00 18.662528 2024-01-06 15:00:00 18.662841 2024-01-06 16:00:00 18.663146 2024-01-06 17:00:00 18.663442 2024-01-06 18:00:00 18.663730 2024-01-06 19:00:00 18.664010 2024-01-06 20:00:00 18.664283 2024-01-06 21:00:00 18.664548 2024-01-06 22:00:00 18.664806 2024-01-06 23:00:00 18.665056 2024-01-07 00:00:00 18.665300 2024-01-07 01:00:00 18.665537 2024-01-07 02:00:00 18.665768 2024-01-07 03:00:00 18.665993 2024-01-07 04:00:00 18.666211 2024-01-07 05:00:00 18.666423 2024-01-07 06:00:00 18.666630 2024-01-07 07:00:00 18.666830 2024-01-07 08:00:00 18.667026 2024-01-07 09:00:00 18.667216 2024-01-07 10:00:00 18.667401 2024-01-07 11:00:00 18.667580 2024-01-07 12:00:00 18.667755 2024-01-07 13:00:00 18.667925 2024-01-07 14:00:00 18.668091 2024-01-07 15:00:00 18.668251 2024-01-07 16:00:00 18.668408 2024-01-07 17:00:00 18.668560 2024-01-07 18:00:00 18.668708 2024-01-07 19:00:00 18.668852 2024-01-07 20:00:00 18.668992 2024-01-07 21:00:00 18.669128 2024-01-07 22:00:00 18.669261 2024-01-07 23:00:00 18.669390 2024-01-08 00:00:00 18.669515 2024-01-08 01:00:00 18.669637 2024-01-08 02:00:00 18.669755 2024-01-08 03:00:00 18.669871 2024-01-08 04:00:00 18.669983 2024-01-08 05:00:00 18.670092 2024-01-08 06:00:00 18.670198 2024-01-08 07:00:00 18.670301 2024-01-08 08:00:00 18.670401 2024-01-08 09:00:00 18.670499 2024-01-08 10:00:00 18.670594 2024-01-08 11:00:00 18.670686 2024-01-08 12:00:00 18.670776 2024-01-08 13:00:00 18.670864 2024-01-08 14:00:00 18.670949 2024-01-08 15:00:00 18.671031 2024-01-08 16:00:00 18.671112 2024-01-08 17:00:00 18.671190 2024-01-08 18:00:00 18.671266 2024-01-08 19:00:00 18.671340 2024-01-08 20:00:00 18.671412 2024-01-08 21:00:00 18.671482 2024-01-08 22:00:00 18.671550 2024-01-08 23:00:00 18.671616 2024-01-09 00:00:00 18.671680 2024-01-09 01:00:00 18.671743 2024-01-09 02:00:00 18.671804 2024-01-09 03:00:00 18.671863 2024-01-09 04:00:00 18.671921 2024-01-09 05:00:00 18.671977 2024-01-09 06:00:00 18.672031 2024-01-09 07:00:00 18.672084 2024-01-09 08:00:00 18.672136 2024-01-09 09:00:00 18.672186 2024-01-09 10:00:00 18.672235 2024-01-09 11:00:00 18.672282 2024-01-09 12:00:00 18.672328 2024-01-09 13:00:00 18.672373 2024-01-09 14:00:00 18.672417 2024-01-09 15:00:00 18.672459 2024-01-09 16:00:00 18.672501 2024-01-09 17:00:00 18.672541 2024-01-09 18:00:00 18.672580 2024-01-09 19:00:00 18.672618 2024-01-09 20:00:00 18.672655 2024-01-09 21:00:00 18.672691 2024-01-09 22:00:00 18.672726 2024-01-09 23:00:00 18.672760 2024-01-10 00:00:00 18.672793 2024-01-10 01:00:00 18.672825 2024-01-10 02:00:00 18.672856 2024-01-10 03:00:00 18.672887 2024-01-10 04:00:00 18.672916 2024-01-10 05:00:00 18.672945 2024-01-10 06:00:00 18.672973 2024-01-10 07:00:00 18.673000 2024-01-10 08:00:00 18.673027 2024-01-10 09:00:00 18.673053 2024-01-10 10:00:00 18.673078 2024-01-10 11:00:00 18.673102 2024-01-10 12:00:00 18.673126 2024-01-10 13:00:00 18.673149 2024-01-10 14:00:00 18.673171 2024-01-10 15:00:00 18.673193 2024-01-10 16:00:00 18.673214 2024-01-10 17:00:00 18.673235 2024-01-10 18:00:00 18.673255 2024-01-10 19:00:00 18.673275 2024-01-10 20:00:00 18.673294 2024-01-10 21:00:00 18.673312 2024-01-10 22:00:00 18.673330 2024-01-10 23:00:00 18.673348 2024-01-11 00:00:00 18.673365 2024-01-11 01:00:00 18.673381 2024-01-11 02:00:00 18.673397 2024-01-11 03:00:00 18.673413 2024-01-11 04:00:00 18.673428 2024-01-11 05:00:00 18.673443 2024-01-11 06:00:00 18.673457 2024-01-11 07:00:00 18.673471 2024-01-11 08:00:00 18.673485 2024-01-11 09:00:00 18.673498 2024-01-11 10:00:00 18.673511 2024-01-11 11:00:00 18.673523 2024-01-11 12:00:00 18.673536 2024-01-11 13:00:00 18.673547 2024-01-11 14:00:00 18.673559 2024-01-11 15:00:00 18.673570 2024-01-11 16:00:00 18.673581 2024-01-11 17:00:00 18.673592 2024-01-11 18:00:00 18.673602 2024-01-11 19:00:00 18.673612 2024-01-11 20:00:00 18.673622 2024-01-11 21:00:00 18.673631 2024-01-11 22:00:00 18.673640 2024-01-11 23:00:00 18.673649 2024-01-12 00:00:00 18.673658 2024-01-12 01:00:00 18.673667 2024-01-12 02:00:00 18.673675 2024-01-12 03:00:00 18.673683 2024-01-12 04:00:00 18.673691 2024-01-12 05:00:00 18.673698 2024-01-12 06:00:00 18.673706 2024-01-12 07:00:00 18.673713 2024-01-12 08:00:00 18.673720 2024-01-12 09:00:00 18.673727 2024-01-12 10:00:00 18.673733 2024-01-12 11:00:00 18.673740 2024-01-12 12:00:00 18.673746 2024-01-12 13:00:00 18.673752 2024-01-12 14:00:00 18.673758 2024-01-12 15:00:00 18.673764 2024-01-12 16:00:00 18.673769 2024-01-12 17:00:00 18.673775 2024-01-12 18:00:00 18.673780 2024-01-12 19:00:00 18.673785 2024-01-12 20:00:00 18.673790 2024-01-12 21:00:00 18.673795 2024-01-12 22:00:00 18.673800 2024-01-12 23:00:00 18.673805 2024-01-13 00:00:00 18.673809 2024-01-13 01:00:00 18.673813 2024-01-13 02:00:00 18.673818 2024-01-13 03:00:00 18.673822 2024-01-13 04:00:00 18.673826 2024-01-13 05:00:00 18.673830 2024-01-13 06:00:00 18.673834 2024-01-13 07:00:00 18.673837 2024-01-13 08:00:00 18.673841 2024-01-13 09:00:00 18.673844 2024-01-13 10:00:00 18.673848 2024-01-13 11:00:00 18.673851 2024-01-13 12:00:00 18.673854 2024-01-13 13:00:00 18.673857 2024-01-13 14:00:00 18.673860 2024-01-13 15:00:00 18.673863 2024-01-13 16:00:00 18.673866 2024-01-13 17:00:00 18.673869 2024-01-13 18:00:00 18.673872 2024-01-13 19:00:00 18.673874 2024-01-13 20:00:00 18.673877 2024-01-13 21:00:00 18.673880 2024-01-13 22:00:00 18.673882 2024-01-13 23:00:00 18.673884 2024-01-14 00:00:00 18.673887 2024-01-14 01:00:00 18.673889 2024-01-14 02:00:00 18.673891 2024-01-14 03:00:00 18.673893 2024-01-14 04:00:00 18.673895 2024-01-14 05:00:00 18.673897 2024-01-14 06:00:00 18.673899 2024-01-14 07:00:00 18.673901 2024-01-14 08:00:00 18.673903 2024-01-14 09:00:00 18.673905 2024-01-14 10:00:00 18.673906 2024-01-14 11:00:00 18.673908 2024-01-14 12:00:00 18.673910 2024-01-14 13:00:00 18.673911 2024-01-14 14:00:00 18.673913 2024-01-14 15:00:00 18.673915 2024-01-14 16:00:00 18.673916 2024-01-14 17:00:00 18.673917 2024-01-14 18:00:00 18.673919 2024-01-14 19:00:00 18.673920 2024-01-14 20:00:00 18.673922 2024-01-14 21:00:00 18.673923 2024-01-14 22:00:00 18.673924 2024-01-14 23:00:00 18.673925 2024-01-15 00:00:00 18.673926 2024-01-15 01:00:00 18.673928 2024-01-15 02:00:00 18.673929 2024-01-15 03:00:00 18.673930 2024-01-15 04:00:00 18.673931 2024-01-15 05:00:00 18.673932 2024-01-15 06:00:00 18.673933 2024-01-15 07:00:00 18.673934 2024-01-15 08:00:00 18.673935 2024-01-15 09:00:00 18.673936 2024-01-15 10:00:00 18.673937 2024-01-15 11:00:00 18.673938 2024-01-15 12:00:00 18.673938 2024-01-15 13:00:00 18.673939 2024-01-15 14:00:00 18.673940 2024-01-15 15:00:00 18.673941 2024-01-15 16:00:00 18.673942 2024-01-15 17:00:00 18.673942 2024-01-15 18:00:00 18.673943 2024-01-15 19:00:00 18.673944 2024-01-15 20:00:00 18.673944 2024-01-15 21:00:00 18.673945 2024-01-15 22:00:00 18.673946 2024-01-15 23:00:00 18.673946 2024-01-16 00:00:00 18.673947 2024-01-16 01:00:00 18.673948 2024-01-16 02:00:00 18.673948 2024-01-16 03:00:00 18.673949 2024-01-16 04:00:00 18.673949 2024-01-16 05:00:00 18.673950 2024-01-16 06:00:00 18.673950 2024-01-16 07:00:00 18.673951 2024-01-16 08:00:00 18.673951 2024-01-16 09:00:00 18.673952 2024-01-16 10:00:00 18.673952 2024-01-16 11:00:00 18.673953 2024-01-16 12:00:00 18.673953 2024-01-16 13:00:00 18.673953 2024-01-16 14:00:00 18.673954 2024-01-16 15:00:00 18.673954 2024-01-16 16:00:00 18.673955 2024-01-16 17:00:00 18.673955 2024-01-16 18:00:00 18.673955 2024-01-16 19:00:00 18.673956 2024-01-16 20:00:00 18.673956 2024-01-16 21:00:00 18.673956 2024-01-16 22:00:00 18.673957 2024-01-16 23:00:00 18.673957 2024-01-17 00:00:00 18.673957 2024-01-17 01:00:00 18.673958 2024-01-17 02:00:00 18.673958 2024-01-17 03:00:00 18.673958 2024-01-17 04:00:00 18.673959 2024-01-17 05:00:00 18.673959 2024-01-17 06:00:00 18.673959 2024-01-17 07:00:00 18.673959 2024-01-17 08:00:00 18.673960 2024-01-17 09:00:00 18.673960 2024-01-17 10:00:00 18.673960 2024-01-17 11:00:00 18.673960 2024-01-17 12:00:00 18.673961 2024-01-17 13:00:00 18.673961 2024-01-17 14:00:00 18.673961 2024-01-17 15:00:00 18.673961 2024-01-17 16:00:00 18.673961 2024-01-17 17:00:00 18.673962 2024-01-17 18:00:00 18.673962 2024-01-17 19:00:00 18.673962 2024-01-17 20:00:00 18.673962 2024-01-17 21:00:00 18.673962 2024-01-17 22:00:00 18.673963 2024-01-17 23:00:00 18.673963 2024-01-18 00:00:00 18.673963 2024-01-18 01:00:00 18.673963 2024-01-18 02:00:00 18.673963 2024-01-18 03:00:00 18.673963 2024-01-18 04:00:00 18.673963 2024-01-18 05:00:00 18.673964 2024-01-18 06:00:00 18.673964 2024-01-18 07:00:00 18.673964 2024-01-18 08:00:00 18.673964 2024-01-18 09:00:00 18.673964 2024-01-18 10:00:00 18.673964 2024-01-18 11:00:00 18.673964 2024-01-18 12:00:00 18.673964 2024-01-18 13:00:00 18.673965 2024-01-18 14:00:00 18.673965 2024-01-18 15:00:00 18.673965 2024-01-18 16:00:00 18.673965 2024-01-18 17:00:00 18.673965 2024-01-18 18:00:00 18.673965 2024-01-18 19:00:00 18.673965 2024-01-18 20:00:00 18.673965 2024-01-18 21:00:00 18.673965 2024-01-18 22:00:00 18.673965 2024-01-18 23:00:00 18.673966 2024-01-19 00:00:00 18.673966 2024-01-19 01:00:00 18.673966 2024-01-19 02:00:00 18.673966 2024-01-19 03:00:00 18.673966 2024-01-19 04:00:00 18.673966 2024-01-19 05:00:00 18.673966 2024-01-19 06:00:00 18.673966 2024-01-19 07:00:00 18.673966 2024-01-19 08:00:00 18.673966 2024-01-19 09:00:00 18.673966 2024-01-19 10:00:00 18.673966 2024-01-19 11:00:00 18.673966 2024-01-19 12:00:00 18.673966 2024-01-19 13:00:00 18.673967 2024-01-19 14:00:00 18.673967 2024-01-19 15:00:00 18.673967 2024-01-19 16:00:00 18.673967 2024-01-19 17:00:00 18.673967 2024-01-19 18:00:00 18.673967 2024-01-19 19:00:00 18.673967 2024-01-19 20:00:00 18.673967 2024-01-19 21:00:00 18.673967 2024-01-19 22:00:00 18.673967 2024-01-19 23:00:00 18.673967 2024-01-20 00:00:00 18.673967 2024-01-20 01:00:00 18.673967 2024-01-20 02:00:00 18.673967 2024-01-20 03:00:00 18.673967 2024-01-20 04:00:00 18.673967 2024-01-20 05:00:00 18.673967 2024-01-20 06:00:00 18.673967 2024-01-20 07:00:00 18.673967 2024-01-20 08:00:00 18.673967 2024-01-20 09:00:00 18.673967 2024-01-20 10:00:00 18.673967 2024-01-20 11:00:00 18.673967 2024-01-20 12:00:00 18.673967 2024-01-20 13:00:00 18.673968 2024-01-20 14:00:00 18.673968 2024-01-20 15:00:00 18.673968 2024-01-20 16:00:00 18.673968 2024-01-20 17:00:00 18.673968 2024-01-20 18:00:00 18.673968 2024-01-20 19:00:00 18.673968 2024-01-20 20:00:00 18.673968 2024-01-20 21:00:00 18.673968 2024-01-20 22:00:00 18.673968 2024-01-20 23:00:00 18.673968 2024-01-21 00:00:00 18.673968 2024-01-21 01:00:00 18.673968 2024-01-21 02:00:00 18.673968 2024-01-21 03:00:00 18.673968 2024-01-21 04:00:00 18.673968 2024-01-21 05:00:00 18.673968 2024-01-21 06:00:00 18.673968 2024-01-21 07:00:00 18.673968 2024-01-21 08:00:00 18.673968 2024-01-21 09:00:00 18.673968 2024-01-21 10:00:00 18.673968 2024-01-21 11:00:00 18.673968 2024-01-21 12:00:00 18.673968 2024-01-21 13:00:00 18.673968 2024-01-21 14:00:00 18.673968 2024-01-21 15:00:00 18.673968 2024-01-21 16:00:00 18.673968 2024-01-21 17:00:00 18.673968 2024-01-21 18:00:00 18.673968 2024-01-21 19:00:00 18.673968 2024-01-21 20:00:00 18.673968 2024-01-21 21:00:00 18.673968 2024-01-21 22:00:00 18.673968 2024-01-21 23:00:00 18.673968 2024-01-22 00:00:00 18.673968 2024-01-22 01:00:00 18.673968 2024-01-22 02:00:00 18.673968 2024-01-22 03:00:00 18.673968 2024-01-22 04:00:00 18.673968 2024-01-22 05:00:00 18.673968 2024-01-22 06:00:00 18.673968 2024-01-22 07:00:00 18.673968 2024-01-22 08:00:00 18.673968 2024-01-22 09:00:00 18.673968 2024-01-22 10:00:00 18.673968 2024-01-22 11:00:00 18.673968 2024-01-22 12:00:00 18.673968 2024-01-22 13:00:00 18.673968 2024-01-22 14:00:00 18.673968 2024-01-22 15:00:00 18.673968 2024-01-22 16:00:00 18.673968 2024-01-22 17:00:00 18.673968 2024-01-22 18:00:00 18.673968 2024-01-22 19:00:00 18.673968 2024-01-22 20:00:00 18.673968 2024-01-22 21:00:00 18.673968 2024-01-22 22:00:00 18.673968 2024-01-22 23:00:00 18.673968 2024-01-23 00:00:00 18.673968 2024-01-23 01:00:00 18.673968 2024-01-23 02:00:00 18.673968 2024-01-23 03:00:00 18.673968 2024-01-23 04:00:00 18.673968 2024-01-23 05:00:00 18.673968 2024-01-23 06:00:00 18.673968 2024-01-23 07:00:00 18.673968 2024-01-23 08:00:00 18.673968 2024-01-23 09:00:00 18.673968 2024-01-23 10:00:00 18.673968 2024-01-23 11:00:00 18.673968 2024-01-23 12:00:00 18.673968 2024-01-23 13:00:00 18.673968 2024-01-23 14:00:00 18.673968 2024-01-23 15:00:00 18.673968 2024-01-23 16:00:00 18.673968 2024-01-23 17:00:00 18.673968 2024-01-23 18:00:00 18.673968 2024-01-23 19:00:00 18.673968 2024-01-23 20:00:00 18.673968 2024-01-23 21:00:00 18.673968 2024-01-23 22:00:00 18.673968 2024-01-23 23:00:00 18.673968 2024-01-24 00:00:00 18.673968 2024-01-24 01:00:00 18.673968 2024-01-24 02:00:00 18.673968 2024-01-24 03:00:00 18.673968 2024-01-24 04:00:00 18.673968 2024-01-24 05:00:00 18.673968 2024-01-24 06:00:00 18.673968 2024-01-24 07:00:00 18.673968 2024-01-24 08:00:00 18.673968 2024-01-24 09:00:00 18.673968 2024-01-24 10:00:00 18.673968 2024-01-24 11:00:00 18.673968 2024-01-24 12:00:00 18.673968 2024-01-24 13:00:00 18.673968 2024-01-24 14:00:00 18.673968 2024-01-24 15:00:00 18.673968 2024-01-24 16:00:00 18.673968 2024-01-24 17:00:00 18.673968 2024-01-24 18:00:00 18.673968 2024-01-24 19:00:00 18.673968 2024-01-24 20:00:00 18.673968 2024-01-24 21:00:00 18.673969 2024-01-24 22:00:00 18.673969 2024-01-24 23:00:00 18.673969 2024-01-25 00:00:00 18.673969 2024-01-25 01:00:00 18.673969 2024-01-25 02:00:00 18.673969 2024-01-25 03:00:00 18.673969 2024-01-25 04:00:00 18.673969 2024-01-25 05:00:00 18.673969 2024-01-25 06:00:00 18.673969 2024-01-25 07:00:00 18.673969 2024-01-25 08:00:00 18.673969 2024-01-25 09:00:00 18.673969 2024-01-25 10:00:00 18.673969 2024-01-25 11:00:00 18.673969 2024-01-25 12:00:00 18.673969 2024-01-25 13:00:00 18.673969 2024-01-25 14:00:00 18.673969 2024-01-25 15:00:00 18.673969 2024-01-25 16:00:00 18.673969 2024-01-25 17:00:00 18.673969 2024-01-25 18:00:00 18.673969 2024-01-25 19:00:00 18.673969 2024-01-25 20:00:00 18.673969 2024-01-25 21:00:00 18.673969 2024-01-25 22:00:00 18.673969 2024-01-25 23:00:00 18.673969 2024-01-26 00:00:00 18.673969 2024-01-26 01:00:00 18.673969 2024-01-26 02:00:00 18.673969 2024-01-26 03:00:00 18.673969 2024-01-26 04:00:00 18.673969 2024-01-26 05:00:00 18.673969 2024-01-26 06:00:00 18.673969 2024-01-26 07:00:00 18.673969 2024-01-26 08:00:00 18.673969 2024-01-26 09:00:00 18.673969 2024-01-26 10:00:00 18.673969 2024-01-26 11:00:00 18.673969 2024-01-26 12:00:00 18.673969 2024-01-26 13:00:00 18.673969 2024-01-26 14:00:00 18.673969 2024-01-26 15:00:00 18.673969 2024-01-26 16:00:00 18.673969 2024-01-26 17:00:00 18.673969 2024-01-26 18:00:00 18.673969 2024-01-26 19:00:00 18.673969 2024-01-26 20:00:00 18.673969 2024-01-26 21:00:00 18.673969 2024-01-26 22:00:00 18.673969 2024-01-26 23:00:00 18.673969 2024-01-27 00:00:00 18.673969 2024-01-27 01:00:00 18.673969 2024-01-27 02:00:00 18.673969 2024-01-27 03:00:00 18.673969 2024-01-27 04:00:00 18.673969 2024-01-27 05:00:00 18.673969 2024-01-27 06:00:00 18.673969 2024-01-27 07:00:00 18.673969 2024-01-27 08:00:00 18.673969 2024-01-27 09:00:00 18.673969 2024-01-27 10:00:00 18.673969 2024-01-27 11:00:00 18.673969 2024-01-27 12:00:00 18.673969 2024-01-27 13:00:00 18.673969 2024-01-27 14:00:00 18.673969 2024-01-27 15:00:00 18.673969 2024-01-27 16:00:00 18.673969 2024-01-27 17:00:00 18.673969 2024-01-27 18:00:00 18.673969 2024-01-27 19:00:00 18.673969 2024-01-27 20:00:00 18.673969 2024-01-27 21:00:00 18.673969 2024-01-27 22:00:00 18.673969 2024-01-27 23:00:00 18.673969 2024-01-28 00:00:00 18.673969 2024-01-28 01:00:00 18.673969 2024-01-28 02:00:00 18.673969 2024-01-28 03:00:00 18.673969 2024-01-28 04:00:00 18.673969 2024-01-28 05:00:00 18.673969 2024-01-28 06:00:00 18.673969 2024-01-28 07:00:00 18.673969 2024-01-28 08:00:00 18.673969 2024-01-28 09:00:00 18.673969 2024-01-28 10:00:00 18.673969 2024-01-28 11:00:00 18.673969 2024-01-28 12:00:00 18.673969 2024-01-28 13:00:00 18.673969 2024-01-28 14:00:00 18.673969 2024-01-28 15:00:00 18.673969 2024-01-28 16:00:00 18.673969 2024-01-28 17:00:00 18.673969 2024-01-28 18:00:00 18.673969 2024-01-28 19:00:00 18.673969 2024-01-28 20:00:00 18.673969 2024-01-28 21:00:00 18.673969 2024-01-28 22:00:00 18.673969 2024-01-28 23:00:00 18.673969 2024-01-29 00:00:00 18.673969 2024-01-29 01:00:00 18.673969 2024-01-29 02:00:00 18.673969 2024-01-29 03:00:00 18.673969 2024-01-29 04:00:00 18.673969 2024-01-29 05:00:00 18.673969 2024-01-29 06:00:00 18.673969 2024-01-29 07:00:00 18.673969 2024-01-29 08:00:00 18.673969 2024-01-29 09:00:00 18.673969 2024-01-29 10:00:00 18.673969 2024-01-29 11:00:00 18.673969 2024-01-29 12:00:00 18.673969 2024-01-29 13:00:00 18.673969 2024-01-29 14:00:00 18.673969 2024-01-29 15:00:00 18.673969 2024-01-29 16:00:00 18.673969 2024-01-29 17:00:00 18.673969 2024-01-29 18:00:00 18.673969 2024-01-29 19:00:00 18.673969 2024-01-29 20:00:00 18.673969 2024-01-29 21:00:00 18.673969 2024-01-29 22:00:00 18.673969 2024-01-29 23:00:00 18.673969 2024-01-30 00:00:00 18.673969 2024-01-30 01:00:00 18.673969 2024-01-30 02:00:00 18.673969 2024-01-30 03:00:00 18.673969 2024-01-30 04:00:00 18.673969 2024-01-30 05:00:00 18.673969 2024-01-30 06:00:00 18.673969 2024-01-30 07:00:00 18.673969 2024-01-30 08:00:00 18.673969 2024-01-30 09:00:00 18.673969 2024-01-30 10:00:00 18.673969 2024-01-30 11:00:00 18.673969 2024-01-30 12:00:00 18.673969 2024-01-30 13:00:00 18.673969 2024-01-30 14:00:00 18.673969 2024-01-30 15:00:00 18.673969 2024-01-30 16:00:00 18.673969 2024-01-30 17:00:00 18.673969 2024-01-30 18:00:00 18.673969 2024-01-30 19:00:00 18.673969 2024-01-30 20:00:00 18.673969 2024-01-30 21:00:00 18.673969 2024-01-30 22:00:00 18.673969 2024-01-30 23:00:00 18.673969 2024-01-31 00:00:00 18.673969 2024-01-31 01:00:00 18.673969 2024-01-31 02:00:00 18.673969 2024-01-31 03:00:00 18.673969 2024-01-31 04:00:00 18.673969 2024-01-31 05:00:00 18.673969 2024-01-31 06:00:00 18.673969 2024-01-31 07:00:00 18.673969 2024-01-31 08:00:00 18.673969 2024-01-31 09:00:00 18.673969 2024-01-31 10:00:00 18.673969 2024-01-31 11:00:00 18.673969 2024-01-31 12:00:00 18.673969 2024-01-31 13:00:00 18.673969 2024-01-31 14:00:00 18.673969 2024-01-31 15:00:00 18.673969 2024-01-31 16:00:00 18.673969 2024-01-31 17:00:00 18.673969 2024-01-31 18:00:00 18.673969 2024-01-31 19:00:00 18.673969 2024-01-31 20:00:00 18.673969 2024-01-31 21:00:00 18.673969 2024-01-31 22:00:00 18.673969 2024-01-31 23:00:00 18.673969 2024-02-01 00:00:00 18.673969 2024-02-01 01:00:00 18.673969 2024-02-01 02:00:00 18.673969 2024-02-01 03:00:00 18.673969 2024-02-01 04:00:00 18.673969 2024-02-01 05:00:00 18.673969 2024-02-01 06:00:00 18.673969 2024-02-01 07:00:00 18.673969 2024-02-01 08:00:00 18.673969 2024-02-01 09:00:00 18.673969 2024-02-01 10:00:00 18.673969 2024-02-01 11:00:00 18.673969 2024-02-01 12:00:00 18.673969 2024-02-01 13:00:00 18.673969 2024-02-01 14:00:00 18.673969 2024-02-01 15:00:00 18.673969 2024-02-01 16:00:00 18.673969 2024-02-01 17:00:00 18.673969 2024-02-01 18:00:00 18.673969 2024-02-01 19:00:00 18.673969 2024-02-01 20:00:00 18.673969 2024-02-01 21:00:00 18.673969 2024-02-01 22:00:00 18.673969 2024-02-01 23:00:00 18.673969 2024-02-02 00:00:00 18.673969 2024-02-02 01:00:00 18.673969 2024-02-02 02:00:00 18.673969 2024-02-02 03:00:00 18.673969 2024-02-02 04:00:00 18.673969 2024-02-02 05:00:00 18.673969 2024-02-02 06:00:00 18.673969 2024-02-02 07:00:00 18.673969 2024-02-02 08:00:00 18.673969 2024-02-02 09:00:00 18.673969 2024-02-02 10:00:00 18.673969 2024-02-02 11:00:00 18.673969 2024-02-02 12:00:00 18.673969 2024-02-02 13:00:00 18.673969 2024-02-02 14:00:00 18.673969 2024-02-02 15:00:00 18.673969 2024-02-02 16:00:00 18.673969 2024-02-02 17:00:00 18.673969 2024-02-02 18:00:00 18.673969 2024-02-02 19:00:00 18.673969 2024-02-02 20:00:00 18.673969 2024-02-02 21:00:00 18.673969 2024-02-02 22:00:00 18.673969 2024-02-02 23:00:00 18.673969 2024-02-03 00:00:00 18.673969 2024-02-03 01:00:00 18.673969 2024-02-03 02:00:00 18.673969 2024-02-03 03:00:00 18.673969 2024-02-03 04:00:00 18.673969 2024-02-03 05:00:00 18.673969 2024-02-03 06:00:00 18.673969 2024-02-03 07:00:00 18.673969 2024-02-03 08:00:00 18.673969 2024-02-03 09:00:00 18.673969 2024-02-03 10:00:00 18.673969 2024-02-03 11:00:00 18.673969 2024-02-03 12:00:00 18.673969 2024-02-03 13:00:00 18.673969 2024-02-03 14:00:00 18.673969 2024-02-03 15:00:00 18.673969 2024-02-03 16:00:00 18.673969 2024-02-03 17:00:00 18.673969 2024-02-03 18:00:00 18.673969 2024-02-03 19:00:00 18.673969 2024-02-03 20:00:00 18.673969 2024-02-03 21:00:00 18.673969 2024-02-03 22:00:00 18.673969 2024-02-03 23:00:00 18.673969 2024-02-04 00:00:00 18.673969 2024-02-04 01:00:00 18.673969 2024-02-04 02:00:00 18.673969 2024-02-04 03:00:00 18.673969 2024-02-04 04:00:00 18.673969 2024-02-04 05:00:00 18.673969 2024-02-04 06:00:00 18.673969 2024-02-04 07:00:00 18.673969 2024-02-04 08:00:00 18.673969 2024-02-04 09:00:00 18.673969 2024-02-04 10:00:00 18.673969 2024-02-04 11:00:00 18.673969 2024-02-04 12:00:00 18.673969 2024-02-04 13:00:00 18.673969 2024-02-04 14:00:00 18.673969 2024-02-04 15:00:00 18.673969 2024-02-04 16:00:00 18.673969 2024-02-04 17:00:00 18.673969 2024-02-04 18:00:00 18.673969 2024-02-04 19:00:00 18.673969 2024-02-04 20:00:00 18.673969 2024-02-04 21:00:00 18.673969 2024-02-04 22:00:00 18.673969 2024-02-04 23:00:00 18.673969 2024-02-05 00:00:00 18.673969 2024-02-05 01:00:00 18.673969 2024-02-05 02:00:00 18.673969 2024-02-05 03:00:00 18.673969 2024-02-05 04:00:00 18.673969 2024-02-05 05:00:00 18.673969 2024-02-05 06:00:00 18.673969 2024-02-05 07:00:00 18.673969 2024-02-05 08:00:00 18.673969 2024-02-05 09:00:00 18.673969 2024-02-05 10:00:00 18.673969 2024-02-05 11:00:00 18.673969 2024-02-05 12:00:00 18.673969 2024-02-05 13:00:00 18.673969 2024-02-05 14:00:00 18.673969 Freq: H, Name: predicted_mean, dtype: float64
####
forecast_steps = len(data_crime_resampled_test)
forecast_diff = arima_res.forecast(steps=forecast_steps)
conf_int = arima_res.conf_int
forecast_g = arima_res.get_forecast()
conf_int = forecast_g.conf_int()
last_observed_value =1 # Last observed value in the original series
forecast_original_scale = forecast_diff + last_observed_value
plt.figure(figsize=(12, 3))
plt.plot(data_crime_resampled_test.index, data_crime_resampled_test['OccDateTimeCombined'], label='Test Data')
plt.plot(data_crime_resampled_test.index, forecast_original_scale, label='Forecast', color='red')
plt.fill_between(pd.date_range(start=data_crime_resampled.index[-1], periods=steps, freq='H'), conf_int.iloc[:, 0], conf_int.iloc[:, 1], color='pink', alpha=0.5, label='95% CI')
plt.xlabel('Time')
plt.ylabel('Original Scale')
plt.title('ARIMA Forecast on Test Data')
plt.legend()
plt.show()
plt.figure(figsize=(12, 3))
plt.plot(data_crime_resampled_train.index, data_crime_resampled_train['OccDateTimeCombined'], label='Training Data')
plt.plot(data_crime_resampled_test.index, data_crime_resampled_test['OccDateTimeCombined'], label='Test Data')
plt.plot(data_crime_resampled_test.index, forecast_original_scale, label='Forecast', color='red')
plt.fill_between(pd.date_range(start=data_crime_resampled.index[-1], periods=steps, freq='H'), conf_int.iloc[:, 0], conf_int.iloc[:, 1], color='pink', alpha=0.5, label='95% CI')
plt.xlabel('Time')
plt.ylabel('Original Scale')
plt.title('ARIMA Forecast on Test Data')
plt.legend()
plt.show()
# Anomaly Detector
#Flattening data for each day's 24 hr break down
crime_24hr_data = crime_data.groupby(['DATE OCC','time occurance bins h'])['time occurance bins h'].count().unstack()
crime_24hr_data.head()
| time occurance bins h | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| DATE OCC | ||||||||||||||||||||||||
| 2020-01-01 | 157.0 | 49.0 | 39.0 | 25.0 | 23.0 | 22.0 | 20.0 | 21.0 | 68.0 | 34.0 | 31.0 | 18.0 | 180.0 | 27.0 | 32.0 | 37.0 | 26.0 | 27.0 | 44.0 | 19.0 | 35.0 | 24.0 | 20.0 | 20.0 |
| 2020-01-02 | 13.0 | 6.0 | 4.0 | 12.0 | 3.0 | 6.0 | 8.0 | 12.0 | 19.0 | 14.0 | 18.0 | 18.0 | 33.0 | 29.0 | 29.0 | 19.0 | 15.0 | 23.0 | 26.0 | 29.0 | 24.0 | 23.0 | 15.0 | 20.0 |
| 2020-01-03 | 17.0 | 14.0 | 9.0 | 6.0 | 8.0 | 11.0 | 10.0 | 16.0 | 17.0 | 11.0 | 19.0 | 17.0 | 40.0 | 20.0 | 23.0 | 29.0 | 24.0 | 34.0 | 23.0 | 25.0 | 20.0 | 29.0 | 25.0 | 17.0 |
| 2020-01-04 | 20.0 | 17.0 | 15.0 | 1.0 | 13.0 | 4.0 | 8.0 | 6.0 | 10.0 | 18.0 | 29.0 | 28.0 | 27.0 | 29.0 | 17.0 | 24.0 | 21.0 | 23.0 | 17.0 | 21.0 | 20.0 | 25.0 | 25.0 | 22.0 |
| 2020-01-05 | 18.0 | 14.0 | 9.0 | 8.0 | 4.0 | 3.0 | 6.0 | 8.0 | 18.0 | 15.0 | 15.0 | 10.0 | 27.0 | 23.0 | 24.0 | 23.0 | 17.0 | 22.0 | 19.0 | 21.0 | 37.0 | 16.0 | 19.0 | 19.0 |
# Calculating threshold for anomolies
anomoly_detection_data = {}
for i in range(1,25):
val = {}
mean = crime_24hr_data[i].mean()
std_dev = crime_24hr_data[i].std()
val['threshold'] = mean + (5 * std_dev)
val['max'] = crime_24hr_data[1].max()
anomoly_detection_data[i] = val
print(anomoly_detection_data)
{1: {'threshold': 88.20638547810272, 'max': 157.0}, 2: {'threshold': 47.32317999297805, 'max': 157.0}, 3: {'threshold': 36.619696515679834, 'max': 157.0}, 4: {'threshold': 29.92671791432438, 'max': 157.0}, 5: {'threshold': 24.474054674064906, 'max': 157.0}, 6: {'threshold': 23.098331520283786, 'max': 157.0}, 7: {'threshold': 67.9526243555022, 'max': 157.0}, 8: {'threshold': 41.73033791262443, 'max': 157.0}, 9: {'threshold': 53.10161255652651, 'max': 157.0}, 10: {'threshold': 43.568294867639686, 'max': 157.0}, 11: {'threshold': 47.979101260819704, 'max': 157.0}, 12: {'threshold': 48.43904918755898, 'max': 157.0}, 13: {'threshold': 106.53596885339608, 'max': 157.0}, 14: {'threshold': 48.3224916173091, 'max': 157.0}, 15: {'threshold': 49.34446537581046, 'max': 157.0}, 16: {'threshold': 53.04121726048242, 'max': 157.0}, 17: {'threshold': 51.58074698771022, 'max': 157.0}, 18: {'threshold': 54.09345880665861, 'max': 157.0}, 19: {'threshold': 54.00424091549023, 'max': 157.0}, 20: {'threshold': 50.98831448657649, 'max': 157.0}, 21: {'threshold': 53.164381814889154, 'max': 157.0}, 22: {'threshold': 48.62881797171484, 'max': 157.0}, 23: {'threshold': 51.54764102116458, 'max': 157.0}, 24: {'threshold': 48.04124854937449, 'max': 157.0}}
# anomaly detector and Test
from collections import Counter
from datetime import datetime
def detect_anomoly(area, crime_occured_dates_list):
date_incident_holder = Counter(crime_occured_dates_list)
for date, incidents in date_incident_holder.items():
date_time_object = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
hour = date_time_object.hour + 1
if(incidents > anomoly_detection_data[hour]['max']):
print(area,":", date, "ALERT! It's higher than usual")
elif(incidents > anomoly_detection_data[hour]['threshold']):
print(area,":", date, "WARNING! ")
# TESTING
crime_occured_dates_list_1 = ['2020-02-08 18:00:00']*200 + ['2020-02-08 19:00:00']*2
crime_occured_dates_list_2 = ['2020-02-08 17:00:00']*10 + ['2020-02-08 16:00:00']*55
area_datelist = {"Central":crime_occured_dates_list_1, "77th street": crime_occured_dates_list_2}
for area, crime_occured_dates_list in area_datelist.items():
detect_anomoly(area, crime_occured_dates_list)
Central : 2020-02-08 18:00:00 ALERT! It's higher than usual 77th street : 2020-02-08 16:00:00 WARNING!
# Daily data
grouped_q5 = crime_data.groupby(['DATE OCC']).size().reset_index(name='COUNT')
# Monthly data
grouped_q5_monthly = grouped_q5.groupby(pd.Grouper(key='DATE OCC', freq='M')).sum()
# Reset the index to make the 'DATE OCC' column a regular column
grouped_q5_monthly.reset_index(inplace=True)
# Drop the last observation of February 2024, because this month just have 5 days of crime.
grouped_q5_monthly.drop([49], inplace=True)
# Interactive plot to visualize the time series data
base = alt.Chart(grouped_q5_monthly).mark_line(color='black', opacity=0.8).encode(
alt.X('DATE OCC'),
alt.Y('COUNT'),
tooltip = ['DATE OCC', 'COUNT']
).properties(width=800,
height=200,
title='Monthly Number of Crimes')
brush = alt.selection_interval(encodings=['x'])
lower = base.properties(height=60).add_params(brush)
upper = base.encode(alt.X('DATE OCC:T', scale=alt.Scale(domain=brush), title=None))
# Box plot
boxplot = alt.Chart(grouped_q5_monthly).mark_boxplot(color='black', size=100).encode(
alt.X('COUNT:Q', title='').scale(zero=False)
).properties(
width=400,
height=200,
title='Boxplot of the Monthly Number of Crimes'
)
# Histogram
histogram = alt.Chart(grouped_q5_monthly).mark_bar(color='black').encode(
alt.X('COUNT', bin=alt.Bin(maxbins=20), title=""),
alt.Y('count()', title="")
).properties(
width=400,
height=200,
title='Distribution of the Monthly Number of Crimes'
)
upper & lower
histogram | boxplot
# --- Seasonal decomposition using STL ---
# Set 'DATE OCC' as the index of the DataFrame
monthly_df = grouped_q5_monthly.copy()
monthly_df.set_index('DATE OCC', inplace=True)
# Perform seasonal decomposition using STL
stl = STL(monthly_df['COUNT'], period=30)
result = stl.fit()
# Adjust figure size and font size
plt.rcParams['figure.figsize'] = (10, 5) # Set figure width to 15 inches and height to 5 inches
plt.rcParams['font.size'] = 10 # Set font size to 10
# Create subplots
plt.subplot(4, 1, 1)
plt.plot(monthly_df.index, monthly_df['COUNT'], color='black')
plt.title('Original Data')
plt.xlabel('')
plt.subplot(4, 1, 2)
plt.plot(result.trend, color='black')
plt.title('Trend')
plt.xlabel('')
plt.subplot(4, 1, 3)
plt.plot(result.seasonal, color='black', linestyle='--')
plt.title('Seasonal')
plt.xlabel('')
plt.subplot(4, 1, 4)
plt.plot(result.resid, color='black', linestyle='-.')
plt.title('Residual')
plt.xlabel('')
plt.tight_layout() # Adjust layout
plt.show()
# --- Training and Testing Sets ---
# Set the cutoff point for the split (e.g., 80% of data for training, 20% for testing)
train_size = int(0.8 * len(grouped_q5_monthly)) # 80% of data for training, 20% for testing
# Split data into training and testing sets based on the cutoff point
train_data = grouped_q5_monthly.iloc[:train_size]
test_data = grouped_q5_monthly.iloc[train_size:]
# Set 'DATE OCC' as the index of the DataFrame
train_data.set_index('DATE OCC', inplace=True)
test_data.set_index('DATE OCC', inplace=True)
# --- Perform ADF test on the training data ---
adf_result = adfuller(train_data['COUNT'].dropna())
# Print the test statistic and p-value
print('ADF Statistic:', adf_result[0])
print('p-value:', adf_result[1])
# Check for stationarity based on the p-value
if adf_result[1] < 0.05:
print('The time series is stationary (reject the null hypothesis)')
else:
print('The time series is non-stationary (fail to reject the null hypothesis)')
ADF Statistic: -1.2963891402705094 p-value: 0.6308226629747651 The time series is non-stationary (fail to reject the null hypothesis)
# --- ACF and PACF (original data) ---
# Adjust figure size and font size
plt.rcParams['figure.figsize'] = (10, 2) # Set figure width to 15 inches and height to 5 inches
plt.rcParams['font.size'] = 10 # Set font size to 10
plot_acf(train_data['COUNT'], lags=15, color='black')
plt.title('Autocorrelation Function (ACF)')
plt.show()
plot_pacf(train_data['COUNT'], lags=15, method='ywm', color='black')
plt.title('Partial Autocorrelation Function (PACF)')
plt.show()
# First Difference
diff = train_data['COUNT'].diff().dropna()
# Check stationarity of the differenced series
print("Results of Dickey-Fuller Test after differencing:")
adf_diff = adfuller(diff)
print('ADF Statistic:', adf_diff[0])
print('p-value:', adf_diff[1])
# Check for stationarity based on the p-value
if adf_diff[1] < 0.05:
print('The time series is stationary (reject the null hypothesis)')
else:
print('The time series is non-stationary (fail to reject the null hypothesis)')
Results of Dickey-Fuller Test after differencing: ADF Statistic: -8.109940056440026 p-value: 1.2336039593224273e-12 The time series is stationary (reject the null hypothesis)
# --- Plot ACF and PACF of the differenced series ---
# Adjust figure size and font size
plt.rcParams['figure.figsize'] = (10, 2) # Set figure width to 15 inches and height to 5 inches
plt.rcParams['font.size'] = 10 # Set font size to 10
# Plot differenced series
plt.plot(diff, color='black')
plt.title('Differenced Series')
plt.show()
plot_acf(diff, lags=15, color='black')
plt.title('Autocorrelation Function (ACF) of Differenced Series')
plt.show()
plot_pacf(diff, lags=15, method='ywm', color='black')
plt.title('Partial Autocorrelation Function (PACF) of Differenced Series')
plt.show()
To select appropriate values for p and q based on the ACF and PACF plots of the differenced series:
p = 1: The lag value where the PACF plot crosses the upper confidence interval for the first time.
q = 1: The lag value where the ACF plot crosses the upper confidence interval for the first time.
# Fit ARIMA model
p = 1 # select based on PACF
q = 1 # select based on ACF
model = ARIMA(diff, order=(p, 1, q)) # Notice the order, we're using d=1 because we've differenced once
result = model.fit()
# Print model summary
print(result.summary())
SARIMAX Results
==============================================================================
Dep. Variable: COUNT No. Observations: 38
Model: ARIMA(1, 1, 1) Log Likelihood -296.518
Date: Sat, 23 Mar 2024 AIC 599.036
Time: 22:26:03 BIC 603.869
Sample: 02-29-2020 HQIC 600.740
- 03-31-2023
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
ar.L1 -0.3612 0.177 -2.035 0.042 -0.709 -0.013
ma.L1 -0.8135 0.148 -5.490 0.000 -1.104 -0.523
sigma2 5.224e+05 1.24e+05 4.227 0.000 2.8e+05 7.65e+05
===================================================================================
Ljung-Box (L1) (Q): 0.00 Jarque-Bera (JB): 0.32
Prob(Q): 0.97 Prob(JB): 0.85
Heteroskedasticity (H): 0.74 Skew: 0.20
Prob(H) (two-sided): 0.61 Kurtosis: 2.77
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
# Simple Exponential Smoothing
model_ses = SimpleExpSmoothing(train_data['COUNT'])
fit_ses = model_ses.fit()
# Holt Two Parameter Smoothing
model_holt = ExponentialSmoothing(train_data['COUNT'], trend='add', seasonal='add', seasonal_periods=12)
fit_holt = model_holt.fit()
# --- Forecast future values on the test data ---
forecast_steps = len(test_data) # Adjust this according to the length of your test dataset
forecast_diff = result.forecast(steps=forecast_steps)
# Reverse differencing to get the forecasted values in the original scale
last_observed_value = train_data['COUNT'].iloc[-1] # Last observed value in the original series
forecast_original_scale = np.cumsum(forecast_diff) + last_observed_value
# --- Forecast Simple Exponential Smoothing and Double Exponential Smoothing ---
forecast_ses = fit_ses.forecast(steps=len(test_data))
forecast_holt = fit_holt.forecast(steps=len(test_data))
# --- Plotting comparison ---
plt.figure(figsize=(14, 3))
plt.plot(train_data.index, train_data['COUNT'], label='Training Data', color='black')
plt.plot(test_data.index, test_data['COUNT'], label='Test Data')
plt.plot(test_data.index, forecast_original_scale, label='ARIMA Forecast')
plt.plot(test_data.index, forecast_ses, label='Simple Exponential Smoothing Forecast')
plt.plot(test_data.index, forecast_holt, label='Holt Two Parameter Smoothing Forecast')
plt.legend()
plt.show()
# Compute MAE, MSE, RMSE
mae_arima = mean_absolute_error(test_data['COUNT'], forecast_original_scale)
mse_arima = mean_squared_error(test_data['COUNT'], forecast_original_scale)
rmse_arima = np.sqrt(mse_arima)
mae_ses = mean_absolute_error(test_data['COUNT'], forecast_ses)
mse_ses = mean_squared_error(test_data['COUNT'], forecast_ses)
rmse_ses = np.sqrt(mse_ses)
mae_holt = mean_absolute_error(test_data['COUNT'], forecast_holt)
mse_holt = mean_squared_error(test_data['COUNT'], forecast_holt)
rmse_holt = np.sqrt(mse_holt)
data = {
'Method': ['ARIMA', 'Simple Exponential Smoothing', 'Holt Two Parameter Smoothing'],
'MAE': [mae_arima, mae_ses, mae_holt],
'MSE': [mse_arima, mse_ses, mse_holt],
'RMSE': [rmse_arima, rmse_ses, rmse_holt]
}
df = pd.DataFrame(data)
print(df)
Method MAE MSE RMSE 0 ARIMA 571.829262 4.962665e+05 704.461837 1 Simple Exponential Smoothing 634.804258 7.210481e+05 849.145514 2 Holt Two Parameter Smoothing 903.206007 1.258033e+06 1121.620532
# ARIMA: Checking Normality for Residuals
# Get residuals
residuals = result.resid
# Plot residuals
plt.figure(figsize=(12, 3))
plt.plot(residuals, color='black')
plt.title('Residuals Plot')
plt.xlabel('Time')
plt.ylabel('Residuals')
plt.show()
# Plot ACF and PACF of residuals
fig, axes = plt.subplots(1, 2, figsize=(12, 3))
plot_acf(residuals, ax=axes[0], color='black')
plot_pacf(residuals, ax=axes[1], method='ywm', color='black')
plt.show()
# Plot histogram and Q-Q plot of residuals
plt.figure(figsize=(12, 3))
plt.subplot(1, 2, 1)
plt.hist(residuals, bins=15, density=True, color='black')
plt.title('Histogram of Residuals')
plt.xlabel('Residuals')
plt.ylabel('Density')
plt.subplot(1, 2, 2)
stats.probplot(residuals, dist="norm", plot=plt)
plt.title('Q-Q Plot')
plt.xlabel('Theoretical quantiles')
plt.ylabel('Sample quantiles')
plt.show()
# Perform statistical tests for normality
normality_test = stats.shapiro(residuals)
print("Shapiro-Wilk test p-value:", normality_test[1])
Shapiro-Wilk test p-value: 0.8988532423973083
# --- One month ahead forecasting ---
# First Difference
full_diff = grouped_q5_monthly['COUNT'].diff().dropna()
# Fit ARIMA(1,1,1) model
model_arima = ARIMA(full_diff, order=(1,1,1))
fit_arima = model_arima.fit()
# Make one-month ahead forecast
forecast = fit_arima.forecast(steps=1)
# Reverse differencing to get the forecasted values in the original scale
last_observed_value = grouped_q5_monthly['COUNT'].iloc[-1] # Last observed value in the original series
forecast_original_scale = np.cumsum(forecast) + last_observed_value
# Plot original data and forecast
plt.figure(figsize=(16, 3))
# Plot line
plt.plot(grouped_q5_monthly['DATE OCC'], grouped_q5_monthly['COUNT'], label='Original Data', color='black', linestyle='-')
# Overlay points
plt.plot(grouped_q5_monthly['DATE OCC'], grouped_q5_monthly['COUNT'], 'o', color='black')
# Forecasting
plt.plot(grouped_q5_monthly['DATE OCC'].iloc[-1] + pd.DateOffset(months=1), forecast_original_scale , color='red', marker='o', markersize=5, label='Forecast Feb-24')
# Add text with the forecasted value
#plt.text(forecast_original_scale, f'{forecast_original_scale:.2f}', ha='right', va='bottom')
plt.legend()
plt.title('ARIMA(1,1,1) One-Month Ahead Forecast')
plt.xlabel('')
plt.ylabel('Number of Crimes')
plt.show()
# Compute evaluation metrics
actual_value = grouped_q5_monthly['COUNT'].iloc[-1]
mae = mean_absolute_error([actual_value], forecast)
mse = mean_squared_error([actual_value], forecast)
rmse = np.sqrt(mse)
print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", rmse)
MAE: 11860.367745599011 MSE: 140668323.06084538 RMSE: 11860.367745599011
# Creating a table for Latino/Non-Latino and Hispanic Early Adulthood (22-40 years old) for both Genders (Male, Female)
filtered_df = crime_data[(crime_data['Victim_Sex_New'] != 'Unknown') & (crime_data['Victim_Age_New'] == 'Early Adulthood(22-40)')]
filtered_df['Ethnicity'] = filtered_df['Vict_Descent_New'].apply(lambda x: 'Latino and Hispanic' if x == 'Latino and Hispanic' else 'Non-Latino')
final_df = filtered_df[['Victim_Sex_New', 'Victim_Age_New', 'Ethnicity']]
final_df.head()
| Victim_Sex_New | Victim_Age_New | Ethnicity | |
|---|---|---|---|
| 3 | Male | Early Adulthood(22-40) | Latino and Hispanic |
| 5 | Male | Early Adulthood(22-40) | Latino and Hispanic |
| 6 | Female | Early Adulthood(22-40) | Non-Latino |
| 7 | Female | Early Adulthood(22-40) | Non-Latino |
| 8 | Male | Early Adulthood(22-40) | Latino and Hispanic |
# Observed Values
cont_table = pd.crosstab(final_df['Victim_Sex_New'], final_df['Ethnicity'])
cont_table
| Ethnicity | Latino and Hispanic | Non-Latino |
|---|---|---|
| Victim_Sex_New | ||
| Female | 70876 | 93763 |
| Male | 65465 | 94403 |
# Compute Chi-Square Statistic
chi2, p, dof, expected = chi2_contingency(cont_table)
# Determine Degrees of Freedom
degrees_of_freedom = (cont_table.shape[0] - 1) * (cont_table.shape[1] - 1)
# Expected values
expected_df = pd.DataFrame(expected, index=["Latino", "Non-Latino"], columns=["Male", "Female"])
# Set Significance Level
alpha = 0.05
# Compare Chi-Square Statistic with Critical Value
critical_value = chi2_contingency(np.ones(cont_table.shape))[0]
print("Chi-square Statistic:", chi2)
print("Degrees of Freedom:", degrees_of_freedom)
print("Critical Value:", critical_value)
print("p-value:", p)
if chi2 > critical_value:
print("Reject the null hypothesis. There is a significant association between being Latino in early adulthood and the incidence of crime.")
else:
print("Fail to reject the null hypothesis. There is no significant association between being Latino in early adulthood and the incidence of crime.")
print("\nExpected Frequencies:")
print(expected_df)
Chi-square Statistic: 146.725404162415
Degrees of Freedom: 1
Critical Value: 0.0
p-value: 9.010722488794873e-34
Reject the null hypothesis. There is a significant association between being Latino in early adulthood and the incidence of crime.
Expected Frequencies:
Male Female
Latino 69172.763296 95466.236704
Non-Latino 67168.236704 92699.763296
Starting locations are set to LAPD Police Stations coordinates
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans
import numpy as np
from matplotlib.colors import ListedColormap
coords = np.column_stack((crime_data['LON'], crime_data['LAT']))
# Define initial centroids as coordinates of police stations
# Gathered by hand from https://lapdonlinestrgeacc.blob.core.usgovcloudapi.net/lapdonlinemedia/2021/12/LAPD-Area-Stations.pdf
station_coords = {
"Central": (34.04411080306554, -118.24748306155207),
"77th": (33.97037817485058, -118.27796869999999),
"Southwest": (34.010194018412385, -118.30497240388196),
"Pacific": (33.99169172013332, -118.41986776900023),
"Hollywood": (34.09580875431208, -118.33084255471671),
"Southeast": (33.93854705722783, -118.27542700471099),
"Olympic": (34.05039813056319, -118.29114073429325),
"N. Hollywood": (34.17177340492995, -118.3857370001728),
"Wilshire": (34.046614144300634, -118.34263238662076),
"Newton": (34.01245114514639, -118.25626651967428),
"Topanga": (34.22137305584511, -118.59932167541463),
"Rampart": (34.056744454029186, -118.26707559456186),
"West LA": (34.04384233779523, -118.45088588971068),
"Van Nuys": (34.18380155311523, -118.4449382748191),
"West Valley": (34.19353939842985, -118.54767086952694),
"Mission": (34.27336284901255, -118.46859495529038),
"Northeast": (34.11921601294294, -118.2493705313661),
"Devonshire": (34.256815467332956, -118.53044812398777),
"Harbor": (33.75764921706162, -118.28914932140042),
"Hollenbeck": (34.044762711360995, -118.21294130033142),
"Foothill": (34.25312609672471, -118.41042120698145)
}
initial_centroids = np.array([coord for coord in station_coords.values()])
# Kmeans clustering with custom initial centroids
kmeans = KMeans(n_clusters=21, init=initial_centroids, random_state=583)
kmeans.fit(coords)
cluster_centers = kmeans.cluster_centers_
cluster_labels = kmeans.labels_
# Calculate the density of each cluster
cluster_counts = np.bincount(cluster_labels)
cluster_density = cluster_counts / cluster_counts.sum()
# Sort clusters based on density
cluster_rank = np.argsort(cluster_density)[::-1]
# Reorder cluster labels according to density rank
cluster_labels_reordered = np.zeros_like(cluster_labels)
for i, rank in enumerate(cluster_rank):
cluster_labels_reordered[cluster_labels == rank] = i
plt.figure(figsize=(10, 8))
scatter = plt.scatter(crime_data['LON'], crime_data['LAT'], c=cluster_labels_reordered, cmap="viridis", s=20)
# Annotate the most dense cluster with 0, then sequentially number the clusters based on density
for i, center in enumerate(cluster_centers[cluster_rank]):
plt.annotate(f'{str(i)}', center, fontsize=12, color='red', weight='bold')
# Plot LAPD station names
for station, coord in station_coords.items():
plt.text(coord[1], coord[0], station, fontsize=8, color='white', ha='center', va='bottom')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.title('New Crime Clusters (KMeans)')
# Define a custom color map with the number of clusters
custom_cmap = ListedColormap(plt.cm.viridis(np.linspace(0, 1, 21)))
cbar = plt.colorbar(scatter, ticks=np.arange(21))
cbar.set_label('Cluster Number')
cbar.set_ticklabels(np.arange(21))
plt.show()
Uses KMeans results as starting locations
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
# Assuming df_crime['LON'] and df_crime['LAT'] are your longitude and latitude data
coords = np.column_stack((crime_data['LON'], crime_data['LAT']))
# Perform K-means clustering
kmeans = KMeans(n_clusters=21)
kmeans.fit(coords)
# Get cluster centers and labels
cluster_centers = kmeans.cluster_centers_
cluster_labels = kmeans.labels_
# Use K-Means results as initial parameters for Mixture Model
gmm = GaussianMixture(n_components=21, covariance_type='full', means_init=cluster_centers)
gmm.fit(coords)
# Get GMM cluster labels
gmm_labels = gmm.predict(coords)
mm_clusts = np.sort(cluster_density)[::-1]
plt.figure(figsize=(10, 8))
plt.scatter(crime_data['LON'], crime_data['LAT'], c=gmm_labels, cmap='viridis')
# Annotate the cluster centers with their cluster numbers and densities
for i, (center, density) in enumerate(zip(gmm.means_, mm_clusts)):
plt.annotate(f'{i}', center, fontsize=12, color='red')
# Plot LAPD station names
for station, coord in station_coords.items():
plt.text(coord[1], coord[0], station, fontsize=8, color='white', ha='center', va='bottom')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.title('Crime Clusters (Gaussian Mixture Model)')
cbar = plt.colorbar()
cbar.set_label('Density Level')
min_density = np.min(mm_clusts)
max_density = np.max(mm_clusts)
num_ticks = 10
tick_values = np.linspace(min_density, max_density, num_ticks)
tick_labels = [f'{density:.4f}' for density in tick_values]
tick_values = tick_values[::-1]
tick_labels = tick_labels[::-1]
cbar.set_ticks(np.linspace(0, len(mm_clusts), num_ticks))
cbar.set_ticklabels(tick_labels)
cbar.ax.invert_yaxis()
plt.show()
df_clusters_gmm = pd.DataFrame({"Density Value": mm_clusts})
df_clusters_gmm.index.name = 'Cluster Number'
df_clusters_gmm
| Density Value | |
|---|---|
| Cluster Number | |
| 0 | 0.178524 |
| 1 | 0.126187 |
| 2 | 0.111095 |
| 3 | 0.094406 |
| 4 | 0.054518 |
| 5 | 0.053017 |
| 6 | 0.045476 |
| 7 | 0.043438 |
| 8 | 0.035199 |
| 9 | 0.034086 |
| 10 | 0.032279 |
| 11 | 0.031000 |
| 12 | 0.030028 |
| 13 | 0.029547 |
| 14 | 0.022587 |
| 15 | 0.020930 |
| 16 | 0.020539 |
| 17 | 0.012264 |
| 18 | 0.009831 |
| 19 | 0.007763 |
| 20 | 0.007287 |
# Predict cluster labels for each data point
nearest_cluster_labels = kmeans.predict(coords)
cluster_area_mapping = {
0: "NEW_Devonshire",
1: "NEW_East_Valley",
2: "NEW_North_VanNuys",
3: "NEW_Topanga",
4: "NEW_Middle_Harbor",
5: "NEW_West_Devonshire",
6: "NEW_South_VanNuys",
7: "NEW_North_Hollywood",
8: "NEW_South_Harbor",
9: "NEW_East_Foothill",
10: "NEW_West_LA",
11: "NEW_77th",
12: "NEW_Pacific",
13: "NEW_Southwest",
14: "NEW_East_Central",
15: "NEW_Central",
16: "NEW_North_Harbor",
17: "NEW_West_Valley",
18: "NEW_Hollywood",
19: "NEW_Mission",
20: "NEW_South_Foothill",
}
crime_data['Kmeans Zones'] = np.vectorize(cluster_area_mapping.get)(nearest_cluster_labels)
#crime_data[['AREA NAME', 'Kmeans Zones']]
cluster_area_mapping = {
0: "NEW_Central",
1: "NEW_Hollywood",
2: "NEW_77th",
3: "NEW_Southwest",
4: "NEW_East_Central",
5: "NEW_West_LA",
6: "NEW_Pacific",
7: "NEW_North_VanNuys",
8: "NEW_South_VanNuys",
9: "NEW_North_Hollywood",
10: "NEW_East_Valley",
11: "NEW_West_Valley",
12: "NEW_South_Foothill",
13: "NEW_Mission",
14: "NEW_Topanga",
15: "NEW_Devonshire",
16: "NEW_North_Harbor",
17: "NEW_West_Devonshire",
18: "NEW_Middle_Harbor",
19: "NEW_South_Harbor",
20: "NEW_East_Foothill",
}
df_kmeans_clusts = pd.DataFrame({
"Cluster Rank": range(21),
"Density Value": [0.178524, 0.126187, 0.111095, 0.094406, 0.054518, 0.053017, 0.045476, 0.043438, 0.035199, 0.034086,
0.032279, 0.031000, 0.030028, 0.029547, 0.022587, 0.020930, 0.020539, 0.012264, 0.009831, 0.007763,
0.007287]
})
df_kmeans_clusts['Kmeans Zones'] = df_kmeans_clusts['Cluster Rank'].map(cluster_area_mapping)
df_kmeans_clusts
| Cluster Rank | Density Value | Kmeans Zones | |
|---|---|---|---|
| 0 | 0 | 0.178524 | NEW_Central |
| 1 | 1 | 0.126187 | NEW_Hollywood |
| 2 | 2 | 0.111095 | NEW_77th |
| 3 | 3 | 0.094406 | NEW_Southwest |
| 4 | 4 | 0.054518 | NEW_East_Central |
| 5 | 5 | 0.053017 | NEW_West_LA |
| 6 | 6 | 0.045476 | NEW_Pacific |
| 7 | 7 | 0.043438 | NEW_North_VanNuys |
| 8 | 8 | 0.035199 | NEW_South_VanNuys |
| 9 | 9 | 0.034086 | NEW_North_Hollywood |
| 10 | 10 | 0.032279 | NEW_East_Valley |
| 11 | 11 | 0.031000 | NEW_West_Valley |
| 12 | 12 | 0.030028 | NEW_South_Foothill |
| 13 | 13 | 0.029547 | NEW_Mission |
| 14 | 14 | 0.022587 | NEW_Topanga |
| 15 | 15 | 0.020930 | NEW_Devonshire |
| 16 | 16 | 0.020539 | NEW_North_Harbor |
| 17 | 17 | 0.012264 | NEW_West_Devonshire |
| 18 | 18 | 0.009831 | NEW_Middle_Harbor |
| 19 | 19 | 0.007763 | NEW_South_Harbor |
| 20 | 20 | 0.007287 | NEW_East_Foothill |
average_density = df_kmeans_clusts['Density Value'].mean()
plt.figure(figsize=(10, 8))
bars = plt.barh(df_kmeans_clusts['Kmeans Zones'], df_kmeans_clusts['Density Value'],
color=plt.cm.viridis_r(df_kmeans_clusts['Density Value'] / df_kmeans_clusts['Density Value'].max()))
plt.axvline(x=average_density, color='red', linestyle='--', label='Average Percentage')
plt.xlabel('Percentage of total crime (%)')
plt.ylabel('Kmeans Zones')
plt.title('Percentage Value by Kmeans Zones')
plt.legend()
plt.tight_layout()
plt.show()